Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/classify/cluster.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison legend: equal | deleted | inserted | replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************************** | |
| 2 ** Filename: cluster.cpp | |
| 3 ** Purpose: Routines for clustering points in N-D space | |
| 4 ** Author: Dan Johnson | |
| 5 ** | |
| 6 ** (c) Copyright Hewlett-Packard Company, 1988. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 *****************************************************************************/ | |
| 17 | |
| 18 #define _USE_MATH_DEFINES // for M_PI | |
| 19 | |
| 20 #include "cluster.h" | |
| 21 | |
| 22 #include "genericheap.h" | |
| 23 #include "kdpair.h" | |
| 24 #include "matrix.h" | |
| 25 #include "tprintf.h" | |
| 26 | |
| 27 #include "helpers.h" | |
| 28 | |
| 29 #include <cfloat> // for FLT_MAX | |
| 30 #include <cmath> // for M_PI | |
| 31 #include <vector> // for std::vector | |
| 32 | |
| 33 namespace tesseract { | |
| 34 | |
| 35 #define HOTELLING 1 // If true use Hotelling's test to decide where to split. | |
| 36 #define FTABLE_X 10 // Size of FTable. | |
| 37 #define FTABLE_Y 100 // Size of FTable. | |
| 38 | |
| 39 // Table of values approximating the cumulative F-distribution for a confidence | |
| 40 // of 1%. | |
| 41 const double FTable[FTABLE_Y][FTABLE_X] = { | |
| 42 { | |
| 43 4052.19, | |
| 44 4999.52, | |
| 45 5403.34, | |
| 46 5624.62, | |
| 47 5763.65, | |
| 48 5858.97, | |
| 49 5928.33, | |
| 50 5981.10, | |
| 51 6022.50, | |
| 52 6055.85, | |
| 53 }, | |
| 54 { | |
| 55 98.502, | |
| 56 99.000, | |
| 57 99.166, | |
| 58 99.249, | |
| 59 99.300, | |
| 60 99.333, | |
| 61 99.356, | |
| 62 99.374, | |
| 63 99.388, | |
| 64 99.399, | |
| 65 }, | |
| 66 { | |
| 67 34.116, | |
| 68 30.816, | |
| 69 29.457, | |
| 70 28.710, | |
| 71 28.237, | |
| 72 27.911, | |
| 73 27.672, | |
| 74 27.489, | |
| 75 27.345, | |
| 76 27.229, | |
| 77 }, | |
| 78 { | |
| 79 21.198, | |
| 80 18.000, | |
| 81 16.694, | |
| 82 15.977, | |
| 83 15.522, | |
| 84 15.207, | |
| 85 14.976, | |
| 86 14.799, | |
| 87 14.659, | |
| 88 14.546, | |
| 89 }, | |
| 90 { | |
| 91 16.258, | |
| 92 13.274, | |
| 93 12.060, | |
| 94 11.392, | |
| 95 10.967, | |
| 96 10.672, | |
| 97 10.456, | |
| 98 10.289, | |
| 99 10.158, | |
| 100 10.051, | |
| 101 }, | |
| 102 { | |
| 103 13.745, | |
| 104 10.925, | |
| 105 9.780, | |
| 106 9.148, | |
| 107 8.746, | |
| 108 8.466, | |
| 109 8.260, | |
| 110 8.102, | |
| 111 7.976, | |
| 112 7.874, | |
| 113 }, | |
| 114 { | |
| 115 12.246, | |
| 116 9.547, | |
| 117 8.451, | |
| 118 7.847, | |
| 119 7.460, | |
| 120 7.191, | |
| 121 6.993, | |
| 122 6.840, | |
| 123 6.719, | |
| 124 6.620, | |
| 125 }, | |
| 126 { | |
| 127 11.259, | |
| 128 8.649, | |
| 129 7.591, | |
| 130 7.006, | |
| 131 6.632, | |
| 132 6.371, | |
| 133 6.178, | |
| 134 6.029, | |
| 135 5.911, | |
| 136 5.814, | |
| 137 }, | |
| 138 { | |
| 139 10.561, | |
| 140 8.022, | |
| 141 6.992, | |
| 142 6.422, | |
| 143 6.057, | |
| 144 5.802, | |
| 145 5.613, | |
| 146 5.467, | |
| 147 5.351, | |
| 148 5.257, | |
| 149 }, | |
| 150 { | |
| 151 10.044, | |
| 152 7.559, | |
| 153 6.552, | |
| 154 5.994, | |
| 155 5.636, | |
| 156 5.386, | |
| 157 5.200, | |
| 158 5.057, | |
| 159 4.942, | |
| 160 4.849, | |
| 161 }, | |
| 162 { | |
| 163 9.646, | |
| 164 7.206, | |
| 165 6.217, | |
| 166 5.668, | |
| 167 5.316, | |
| 168 5.069, | |
| 169 4.886, | |
| 170 4.744, | |
| 171 4.632, | |
| 172 4.539, | |
| 173 }, | |
| 174 { | |
| 175 9.330, | |
| 176 6.927, | |
| 177 5.953, | |
| 178 5.412, | |
| 179 5.064, | |
| 180 4.821, | |
| 181 4.640, | |
| 182 4.499, | |
| 183 4.388, | |
| 184 4.296, | |
| 185 }, | |
| 186 { | |
| 187 9.074, | |
| 188 6.701, | |
| 189 5.739, | |
| 190 5.205, | |
| 191 4.862, | |
| 192 4.620, | |
| 193 4.441, | |
| 194 4.302, | |
| 195 4.191, | |
| 196 4.100, | |
| 197 }, | |
| 198 { | |
| 199 8.862, | |
| 200 6.515, | |
| 201 5.564, | |
| 202 5.035, | |
| 203 4.695, | |
| 204 4.456, | |
| 205 4.278, | |
| 206 4.140, | |
| 207 4.030, | |
| 208 3.939, | |
| 209 }, | |
| 210 { | |
| 211 8.683, | |
| 212 6.359, | |
| 213 5.417, | |
| 214 4.893, | |
| 215 4.556, | |
| 216 4.318, | |
| 217 4.142, | |
| 218 4.004, | |
| 219 3.895, | |
| 220 3.805, | |
| 221 }, | |
| 222 { | |
| 223 8.531, | |
| 224 6.226, | |
| 225 5.292, | |
| 226 4.773, | |
| 227 4.437, | |
| 228 4.202, | |
| 229 4.026, | |
| 230 3.890, | |
| 231 3.780, | |
| 232 3.691, | |
| 233 }, | |
| 234 { | |
| 235 8.400, | |
| 236 6.112, | |
| 237 5.185, | |
| 238 4.669, | |
| 239 4.336, | |
| 240 4.102, | |
| 241 3.927, | |
| 242 3.791, | |
| 243 3.682, | |
| 244 3.593, | |
| 245 }, | |
| 246 { | |
| 247 8.285, | |
| 248 6.013, | |
| 249 5.092, | |
| 250 4.579, | |
| 251 4.248, | |
| 252 4.015, | |
| 253 3.841, | |
| 254 3.705, | |
| 255 3.597, | |
| 256 3.508, | |
| 257 }, | |
| 258 { | |
| 259 8.185, | |
| 260 5.926, | |
| 261 5.010, | |
| 262 4.500, | |
| 263 4.171, | |
| 264 3.939, | |
| 265 3.765, | |
| 266 3.631, | |
| 267 3.523, | |
| 268 3.434, | |
| 269 }, | |
| 270 { | |
| 271 8.096, | |
| 272 5.849, | |
| 273 4.938, | |
| 274 4.431, | |
| 275 4.103, | |
| 276 3.871, | |
| 277 3.699, | |
| 278 3.564, | |
| 279 3.457, | |
| 280 3.368, | |
| 281 }, | |
| 282 { | |
| 283 8.017, | |
| 284 5.780, | |
| 285 4.874, | |
| 286 4.369, | |
| 287 4.042, | |
| 288 3.812, | |
| 289 3.640, | |
| 290 3.506, | |
| 291 3.398, | |
| 292 3.310, | |
| 293 }, | |
| 294 { | |
| 295 7.945, | |
| 296 5.719, | |
| 297 4.817, | |
| 298 4.313, | |
| 299 3.988, | |
| 300 3.758, | |
| 301 3.587, | |
| 302 3.453, | |
| 303 3.346, | |
| 304 3.258, | |
| 305 }, | |
| 306 { | |
| 307 7.881, | |
| 308 5.664, | |
| 309 4.765, | |
| 310 4.264, | |
| 311 3.939, | |
| 312 3.710, | |
| 313 3.539, | |
| 314 3.406, | |
| 315 3.299, | |
| 316 3.211, | |
| 317 }, | |
| 318 { | |
| 319 7.823, | |
| 320 5.614, | |
| 321 4.718, | |
| 322 4.218, | |
| 323 3.895, | |
| 324 3.667, | |
| 325 3.496, | |
| 326 3.363, | |
| 327 3.256, | |
| 328 3.168, | |
| 329 }, | |
| 330 { | |
| 331 7.770, | |
| 332 5.568, | |
| 333 4.675, | |
| 334 4.177, | |
| 335 3.855, | |
| 336 3.627, | |
| 337 3.457, | |
| 338 3.324, | |
| 339 3.217, | |
| 340 3.129, | |
| 341 }, | |
| 342 { | |
| 343 7.721, | |
| 344 5.526, | |
| 345 4.637, | |
| 346 4.140, | |
| 347 3.818, | |
| 348 3.591, | |
| 349 3.421, | |
| 350 3.288, | |
| 351 3.182, | |
| 352 3.094, | |
| 353 }, | |
| 354 { | |
| 355 7.677, | |
| 356 5.488, | |
| 357 4.601, | |
| 358 4.106, | |
| 359 3.785, | |
| 360 3.558, | |
| 361 3.388, | |
| 362 3.256, | |
| 363 3.149, | |
| 364 3.062, | |
| 365 }, | |
| 366 { | |
| 367 7.636, | |
| 368 5.453, | |
| 369 4.568, | |
| 370 4.074, | |
| 371 3.754, | |
| 372 3.528, | |
| 373 3.358, | |
| 374 3.226, | |
| 375 3.120, | |
| 376 3.032, | |
| 377 }, | |
| 378 { | |
| 379 7.598, | |
| 380 5.420, | |
| 381 4.538, | |
| 382 4.045, | |
| 383 3.725, | |
| 384 3.499, | |
| 385 3.330, | |
| 386 3.198, | |
| 387 3.092, | |
| 388 3.005, | |
| 389 }, | |
| 390 { | |
| 391 7.562, | |
| 392 5.390, | |
| 393 4.510, | |
| 394 4.018, | |
| 395 3.699, | |
| 396 3.473, | |
| 397 3.305, | |
| 398 3.173, | |
| 399 3.067, | |
| 400 2.979, | |
| 401 }, | |
| 402 { | |
| 403 7.530, | |
| 404 5.362, | |
| 405 4.484, | |
| 406 3.993, | |
| 407 3.675, | |
| 408 3.449, | |
| 409 3.281, | |
| 410 3.149, | |
| 411 3.043, | |
| 412 2.955, | |
| 413 }, | |
| 414 { | |
| 415 7.499, | |
| 416 5.336, | |
| 417 4.459, | |
| 418 3.969, | |
| 419 3.652, | |
| 420 3.427, | |
| 421 3.258, | |
| 422 3.127, | |
| 423 3.021, | |
| 424 2.934, | |
| 425 }, | |
| 426 { | |
| 427 7.471, | |
| 428 5.312, | |
| 429 4.437, | |
| 430 3.948, | |
| 431 3.630, | |
| 432 3.406, | |
| 433 3.238, | |
| 434 3.106, | |
| 435 3.000, | |
| 436 2.913, | |
| 437 }, | |
| 438 { | |
| 439 7.444, | |
| 440 5.289, | |
| 441 4.416, | |
| 442 3.927, | |
| 443 3.611, | |
| 444 3.386, | |
| 445 3.218, | |
| 446 3.087, | |
| 447 2.981, | |
| 448 2.894, | |
| 449 }, | |
| 450 { | |
| 451 7.419, | |
| 452 5.268, | |
| 453 4.396, | |
| 454 3.908, | |
| 455 3.592, | |
| 456 3.368, | |
| 457 3.200, | |
| 458 3.069, | |
| 459 2.963, | |
| 460 2.876, | |
| 461 }, | |
| 462 { | |
| 463 7.396, | |
| 464 5.248, | |
| 465 4.377, | |
| 466 3.890, | |
| 467 3.574, | |
| 468 3.351, | |
| 469 3.183, | |
| 470 3.052, | |
| 471 2.946, | |
| 472 2.859, | |
| 473 }, | |
| 474 { | |
| 475 7.373, | |
| 476 5.229, | |
| 477 4.360, | |
| 478 3.873, | |
| 479 3.558, | |
| 480 3.334, | |
| 481 3.167, | |
| 482 3.036, | |
| 483 2.930, | |
| 484 2.843, | |
| 485 }, | |
| 486 { | |
| 487 7.353, | |
| 488 5.211, | |
| 489 4.343, | |
| 490 3.858, | |
| 491 3.542, | |
| 492 3.319, | |
| 493 3.152, | |
| 494 3.021, | |
| 495 2.915, | |
| 496 2.828, | |
| 497 }, | |
| 498 { | |
| 499 7.333, | |
| 500 5.194, | |
| 501 4.327, | |
| 502 3.843, | |
| 503 3.528, | |
| 504 3.305, | |
| 505 3.137, | |
| 506 3.006, | |
| 507 2.901, | |
| 508 2.814, | |
| 509 }, | |
| 510 { | |
| 511 7.314, | |
| 512 5.179, | |
| 513 4.313, | |
| 514 3.828, | |
| 515 3.514, | |
| 516 3.291, | |
| 517 3.124, | |
| 518 2.993, | |
| 519 2.888, | |
| 520 2.801, | |
| 521 }, | |
| 522 { | |
| 523 7.296, | |
| 524 5.163, | |
| 525 4.299, | |
| 526 3.815, | |
| 527 3.501, | |
| 528 3.278, | |
| 529 3.111, | |
| 530 2.980, | |
| 531 2.875, | |
| 532 2.788, | |
| 533 }, | |
| 534 { | |
| 535 7.280, | |
| 536 5.149, | |
| 537 4.285, | |
| 538 3.802, | |
| 539 3.488, | |
| 540 3.266, | |
| 541 3.099, | |
| 542 2.968, | |
| 543 2.863, | |
| 544 2.776, | |
| 545 }, | |
| 546 { | |
| 547 7.264, | |
| 548 5.136, | |
| 549 4.273, | |
| 550 3.790, | |
| 551 3.476, | |
| 552 3.254, | |
| 553 3.087, | |
| 554 2.957, | |
| 555 2.851, | |
| 556 2.764, | |
| 557 }, | |
| 558 { | |
| 559 7.248, | |
| 560 5.123, | |
| 561 4.261, | |
| 562 3.778, | |
| 563 3.465, | |
| 564 3.243, | |
| 565 3.076, | |
| 566 2.946, | |
| 567 2.840, | |
| 568 2.754, | |
| 569 }, | |
| 570 { | |
| 571 7.234, | |
| 572 5.110, | |
| 573 4.249, | |
| 574 3.767, | |
| 575 3.454, | |
| 576 3.232, | |
| 577 3.066, | |
| 578 2.935, | |
| 579 2.830, | |
| 580 2.743, | |
| 581 }, | |
| 582 { | |
| 583 7.220, | |
| 584 5.099, | |
| 585 4.238, | |
| 586 3.757, | |
| 587 3.444, | |
| 588 3.222, | |
| 589 3.056, | |
| 590 2.925, | |
| 591 2.820, | |
| 592 2.733, | |
| 593 }, | |
| 594 { | |
| 595 7.207, | |
| 596 5.087, | |
| 597 4.228, | |
| 598 3.747, | |
| 599 3.434, | |
| 600 3.213, | |
| 601 3.046, | |
| 602 2.916, | |
| 603 2.811, | |
| 604 2.724, | |
| 605 }, | |
| 606 { | |
| 607 7.194, | |
| 608 5.077, | |
| 609 4.218, | |
| 610 3.737, | |
| 611 3.425, | |
| 612 3.204, | |
| 613 3.037, | |
| 614 2.907, | |
| 615 2.802, | |
| 616 2.715, | |
| 617 }, | |
| 618 { | |
| 619 7.182, | |
| 620 5.066, | |
| 621 4.208, | |
| 622 3.728, | |
| 623 3.416, | |
| 624 3.195, | |
| 625 3.028, | |
| 626 2.898, | |
| 627 2.793, | |
| 628 2.706, | |
| 629 }, | |
| 630 { | |
| 631 7.171, | |
| 632 5.057, | |
| 633 4.199, | |
| 634 3.720, | |
| 635 3.408, | |
| 636 3.186, | |
| 637 3.020, | |
| 638 2.890, | |
| 639 2.785, | |
| 640 2.698, | |
| 641 }, | |
| 642 { | |
| 643 7.159, | |
| 644 5.047, | |
| 645 4.191, | |
| 646 3.711, | |
| 647 3.400, | |
| 648 3.178, | |
| 649 3.012, | |
| 650 2.882, | |
| 651 2.777, | |
| 652 2.690, | |
| 653 }, | |
| 654 { | |
| 655 7.149, | |
| 656 5.038, | |
| 657 4.182, | |
| 658 3.703, | |
| 659 3.392, | |
| 660 3.171, | |
| 661 3.005, | |
| 662 2.874, | |
| 663 2.769, | |
| 664 2.683, | |
| 665 }, | |
| 666 { | |
| 667 7.139, | |
| 668 5.030, | |
| 669 4.174, | |
| 670 3.695, | |
| 671 3.384, | |
| 672 3.163, | |
| 673 2.997, | |
| 674 2.867, | |
| 675 2.762, | |
| 676 2.675, | |
| 677 }, | |
| 678 { | |
| 679 7.129, | |
| 680 5.021, | |
| 681 4.167, | |
| 682 3.688, | |
| 683 3.377, | |
| 684 3.156, | |
| 685 2.990, | |
| 686 2.860, | |
| 687 2.755, | |
| 688 2.668, | |
| 689 }, | |
| 690 { | |
| 691 7.119, | |
| 692 5.013, | |
| 693 4.159, | |
| 694 3.681, | |
| 695 3.370, | |
| 696 3.149, | |
| 697 2.983, | |
| 698 2.853, | |
| 699 2.748, | |
| 700 2.662, | |
| 701 }, | |
| 702 { | |
| 703 7.110, | |
| 704 5.006, | |
| 705 4.152, | |
| 706 3.674, | |
| 707 3.363, | |
| 708 3.143, | |
| 709 2.977, | |
| 710 2.847, | |
| 711 2.742, | |
| 712 2.655, | |
| 713 }, | |
| 714 { | |
| 715 7.102, | |
| 716 4.998, | |
| 717 4.145, | |
| 718 3.667, | |
| 719 3.357, | |
| 720 3.136, | |
| 721 2.971, | |
| 722 2.841, | |
| 723 2.736, | |
| 724 2.649, | |
| 725 }, | |
| 726 { | |
| 727 7.093, | |
| 728 4.991, | |
| 729 4.138, | |
| 730 3.661, | |
| 731 3.351, | |
| 732 3.130, | |
| 733 2.965, | |
| 734 2.835, | |
| 735 2.730, | |
| 736 2.643, | |
| 737 }, | |
| 738 { | |
| 739 7.085, | |
| 740 4.984, | |
| 741 4.132, | |
| 742 3.655, | |
| 743 3.345, | |
| 744 3.124, | |
| 745 2.959, | |
| 746 2.829, | |
| 747 2.724, | |
| 748 2.637, | |
| 749 }, | |
| 750 { | |
| 751 7.077, | |
| 752 4.977, | |
| 753 4.126, | |
| 754 3.649, | |
| 755 3.339, | |
| 756 3.119, | |
| 757 2.953, | |
| 758 2.823, | |
| 759 2.718, | |
| 760 2.632, | |
| 761 }, | |
| 762 { | |
| 763 7.070, | |
| 764 4.971, | |
| 765 4.120, | |
| 766 3.643, | |
| 767 3.333, | |
| 768 3.113, | |
| 769 2.948, | |
| 770 2.818, | |
| 771 2.713, | |
| 772 2.626, | |
| 773 }, | |
| 774 { | |
| 775 7.062, | |
| 776 4.965, | |
| 777 4.114, | |
| 778 3.638, | |
| 779 3.328, | |
| 780 3.108, | |
| 781 2.942, | |
| 782 2.813, | |
| 783 2.708, | |
| 784 2.621, | |
| 785 }, | |
| 786 { | |
| 787 7.055, | |
| 788 4.959, | |
| 789 4.109, | |
| 790 3.632, | |
| 791 3.323, | |
| 792 3.103, | |
| 793 2.937, | |
| 794 2.808, | |
| 795 2.703, | |
| 796 2.616, | |
| 797 }, | |
| 798 { | |
| 799 7.048, | |
| 800 4.953, | |
| 801 4.103, | |
| 802 3.627, | |
| 803 3.318, | |
| 804 3.098, | |
| 805 2.932, | |
| 806 2.803, | |
| 807 2.698, | |
| 808 2.611, | |
| 809 }, | |
| 810 { | |
| 811 7.042, | |
| 812 4.947, | |
| 813 4.098, | |
| 814 3.622, | |
| 815 3.313, | |
| 816 3.093, | |
| 817 2.928, | |
| 818 2.798, | |
| 819 2.693, | |
| 820 2.607, | |
| 821 }, | |
| 822 { | |
| 823 7.035, | |
| 824 4.942, | |
| 825 4.093, | |
| 826 3.618, | |
| 827 3.308, | |
| 828 3.088, | |
| 829 2.923, | |
| 830 2.793, | |
| 831 2.689, | |
| 832 2.602, | |
| 833 }, | |
| 834 { | |
| 835 7.029, | |
| 836 4.937, | |
| 837 4.088, | |
| 838 3.613, | |
| 839 3.304, | |
| 840 3.084, | |
| 841 2.919, | |
| 842 2.789, | |
| 843 2.684, | |
| 844 2.598, | |
| 845 }, | |
| 846 { | |
| 847 7.023, | |
| 848 4.932, | |
| 849 4.083, | |
| 850 3.608, | |
| 851 3.299, | |
| 852 3.080, | |
| 853 2.914, | |
| 854 2.785, | |
| 855 2.680, | |
| 856 2.593, | |
| 857 }, | |
| 858 { | |
| 859 7.017, | |
| 860 4.927, | |
| 861 4.079, | |
| 862 3.604, | |
| 863 3.295, | |
| 864 3.075, | |
| 865 2.910, | |
| 866 2.781, | |
| 867 2.676, | |
| 868 2.589, | |
| 869 }, | |
| 870 { | |
| 871 7.011, | |
| 872 4.922, | |
| 873 4.074, | |
| 874 3.600, | |
| 875 3.291, | |
| 876 3.071, | |
| 877 2.906, | |
| 878 2.777, | |
| 879 2.672, | |
| 880 2.585, | |
| 881 }, | |
| 882 { | |
| 883 7.006, | |
| 884 4.917, | |
| 885 4.070, | |
| 886 3.596, | |
| 887 3.287, | |
| 888 3.067, | |
| 889 2.902, | |
| 890 2.773, | |
| 891 2.668, | |
| 892 2.581, | |
| 893 }, | |
| 894 { | |
| 895 7.001, | |
| 896 4.913, | |
| 897 4.066, | |
| 898 3.591, | |
| 899 3.283, | |
| 900 3.063, | |
| 901 2.898, | |
| 902 2.769, | |
| 903 2.664, | |
| 904 2.578, | |
| 905 }, | |
| 906 { | |
| 907 6.995, | |
| 908 4.908, | |
| 909 4.062, | |
| 910 3.588, | |
| 911 3.279, | |
| 912 3.060, | |
| 913 2.895, | |
| 914 2.765, | |
| 915 2.660, | |
| 916 2.574, | |
| 917 }, | |
| 918 { | |
| 919 6.990, | |
| 920 4.904, | |
| 921 4.058, | |
| 922 3.584, | |
| 923 3.275, | |
| 924 3.056, | |
| 925 2.891, | |
| 926 2.762, | |
| 927 2.657, | |
| 928 2.570, | |
| 929 }, | |
| 930 { | |
| 931 6.985, | |
| 932 4.900, | |
| 933 4.054, | |
| 934 3.580, | |
| 935 3.272, | |
| 936 3.052, | |
| 937 2.887, | |
| 938 2.758, | |
| 939 2.653, | |
| 940 2.567, | |
| 941 }, | |
| 942 { | |
| 943 6.981, | |
| 944 4.896, | |
| 945 4.050, | |
| 946 3.577, | |
| 947 3.268, | |
| 948 3.049, | |
| 949 2.884, | |
| 950 2.755, | |
| 951 2.650, | |
| 952 2.563, | |
| 953 }, | |
| 954 { | |
| 955 6.976, | |
| 956 4.892, | |
| 957 4.047, | |
| 958 3.573, | |
| 959 3.265, | |
| 960 3.046, | |
| 961 2.881, | |
| 962 2.751, | |
| 963 2.647, | |
| 964 2.560, | |
| 965 }, | |
| 966 { | |
| 967 6.971, | |
| 968 4.888, | |
| 969 4.043, | |
| 970 3.570, | |
| 971 3.261, | |
| 972 3.042, | |
| 973 2.877, | |
| 974 2.748, | |
| 975 2.644, | |
| 976 2.557, | |
| 977 }, | |
| 978 { | |
| 979 6.967, | |
| 980 4.884, | |
| 981 4.040, | |
| 982 3.566, | |
| 983 3.258, | |
| 984 3.039, | |
| 985 2.874, | |
| 986 2.745, | |
| 987 2.640, | |
| 988 2.554, | |
| 989 }, | |
| 990 { | |
| 991 6.963, | |
| 992 4.881, | |
| 993 4.036, | |
| 994 3.563, | |
| 995 3.255, | |
| 996 3.036, | |
| 997 2.871, | |
| 998 2.742, | |
| 999 2.637, | |
| 1000 2.551, | |
| 1001 }, | |
| 1002 { | |
| 1003 6.958, | |
| 1004 4.877, | |
| 1005 4.033, | |
| 1006 3.560, | |
| 1007 3.252, | |
| 1008 3.033, | |
| 1009 2.868, | |
| 1010 2.739, | |
| 1011 2.634, | |
| 1012 2.548, | |
| 1013 }, | |
| 1014 { | |
| 1015 6.954, | |
| 1016 4.874, | |
| 1017 4.030, | |
| 1018 3.557, | |
| 1019 3.249, | |
| 1020 3.030, | |
| 1021 2.865, | |
| 1022 2.736, | |
| 1023 2.632, | |
| 1024 2.545, | |
| 1025 }, | |
| 1026 { | |
| 1027 6.950, | |
| 1028 4.870, | |
| 1029 4.027, | |
| 1030 3.554, | |
| 1031 3.246, | |
| 1032 3.027, | |
| 1033 2.863, | |
| 1034 2.733, | |
| 1035 2.629, | |
| 1036 2.542, | |
| 1037 }, | |
| 1038 { | |
| 1039 6.947, | |
| 1040 4.867, | |
| 1041 4.024, | |
| 1042 3.551, | |
| 1043 3.243, | |
| 1044 3.025, | |
| 1045 2.860, | |
| 1046 2.731, | |
| 1047 2.626, | |
| 1048 2.539, | |
| 1049 }, | |
| 1050 { | |
| 1051 6.943, | |
| 1052 4.864, | |
| 1053 4.021, | |
| 1054 3.548, | |
| 1055 3.240, | |
| 1056 3.022, | |
| 1057 2.857, | |
| 1058 2.728, | |
| 1059 2.623, | |
| 1060 2.537, | |
| 1061 }, | |
| 1062 { | |
| 1063 6.939, | |
| 1064 4.861, | |
| 1065 4.018, | |
| 1066 3.545, | |
| 1067 3.238, | |
| 1068 3.019, | |
| 1069 2.854, | |
| 1070 2.725, | |
| 1071 2.621, | |
| 1072 2.534, | |
| 1073 }, | |
| 1074 { | |
| 1075 6.935, | |
| 1076 4.858, | |
| 1077 4.015, | |
| 1078 3.543, | |
| 1079 3.235, | |
| 1080 3.017, | |
| 1081 2.852, | |
| 1082 2.723, | |
| 1083 2.618, | |
| 1084 2.532, | |
| 1085 }, | |
| 1086 { | |
| 1087 6.932, | |
| 1088 4.855, | |
| 1089 4.012, | |
| 1090 3.540, | |
| 1091 3.233, | |
| 1092 3.014, | |
| 1093 2.849, | |
| 1094 2.720, | |
| 1095 2.616, | |
| 1096 2.529, | |
| 1097 }, | |
| 1098 { | |
| 1099 6.928, | |
| 1100 4.852, | |
| 1101 4.010, | |
| 1102 3.538, | |
| 1103 3.230, | |
| 1104 3.012, | |
| 1105 2.847, | |
| 1106 2.718, | |
| 1107 2.613, | |
| 1108 2.527, | |
| 1109 }, | |
| 1110 { | |
| 1111 6.925, | |
| 1112 4.849, | |
| 1113 4.007, | |
| 1114 3.535, | |
| 1115 3.228, | |
| 1116 3.009, | |
| 1117 2.845, | |
| 1118 2.715, | |
| 1119 2.611, | |
| 1120 2.524, | |
| 1121 }, | |
| 1122 { | |
| 1123 6.922, | |
| 1124 4.846, | |
| 1125 4.004, | |
| 1126 3.533, | |
| 1127 3.225, | |
| 1128 3.007, | |
| 1129 2.842, | |
| 1130 2.713, | |
| 1131 2.609, | |
| 1132 2.522, | |
| 1133 }, | |
| 1134 { | |
| 1135 6.919, | |
| 1136 4.844, | |
| 1137 4.002, | |
| 1138 3.530, | |
| 1139 3.223, | |
| 1140 3.004, | |
| 1141 2.840, | |
| 1142 2.711, | |
| 1143 2.606, | |
| 1144 2.520, | |
| 1145 }, | |
| 1146 { | |
| 1147 6.915, | |
| 1148 4.841, | |
| 1149 3.999, | |
| 1150 3.528, | |
| 1151 3.221, | |
| 1152 3.002, | |
| 1153 2.838, | |
| 1154 2.709, | |
| 1155 2.604, | |
| 1156 2.518, | |
| 1157 }, | |
| 1158 { | |
| 1159 6.912, | |
| 1160 4.838, | |
| 1161 3.997, | |
| 1162 3.525, | |
| 1163 3.218, | |
| 1164 3.000, | |
| 1165 2.835, | |
| 1166 2.706, | |
| 1167 2.602, | |
| 1168 2.515, | |
| 1169 }, | |
| 1170 { | |
| 1171 6.909, | |
| 1172 4.836, | |
| 1173 3.995, | |
| 1174 3.523, | |
| 1175 3.216, | |
| 1176 2.998, | |
| 1177 2.833, | |
| 1178 2.704, | |
| 1179 2.600, | |
| 1180 2.513, | |
| 1181 }, | |
| 1182 { | |
| 1183 6.906, | |
| 1184 4.833, | |
| 1185 3.992, | |
| 1186 3.521, | |
| 1187 3.214, | |
| 1188 2.996, | |
| 1189 2.831, | |
| 1190 2.702, | |
| 1191 2.598, | |
| 1192 2.511, | |
| 1193 }, | |
| 1194 { | |
| 1195 6.904, | |
| 1196 4.831, | |
| 1197 3.990, | |
| 1198 3.519, | |
| 1199 3.212, | |
| 1200 2.994, | |
| 1201 2.829, | |
| 1202 2.700, | |
| 1203 2.596, | |
| 1204 2.509, | |
| 1205 }, | |
| 1206 { | |
| 1207 6.901, | |
| 1208 4.829, | |
| 1209 3.988, | |
| 1210 3.517, | |
| 1211 3.210, | |
| 1212 2.992, | |
| 1213 2.827, | |
| 1214 2.698, | |
| 1215 2.594, | |
| 1216 2.507, | |
| 1217 }, | |
| 1218 { | |
| 1219 6.898, | |
| 1220 4.826, | |
| 1221 3.986, | |
| 1222 3.515, | |
| 1223 3.208, | |
| 1224 2.990, | |
| 1225 2.825, | |
| 1226 2.696, | |
| 1227 2.592, | |
| 1228 2.505, | |
| 1229 }, | |
| 1230 {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}}; | |
| 1231 | |
| 1232 /** define the variance which will be used as a minimum variance for any | |
| 1233 dimension of any feature. Since most features are calculated from numbers | |
| 1234 with a precision no better than 1 in 128, the variance should never be | |
| 1235 less than the square of this number for parameters whose range is 1. */ | |
| 1236 #define MINVARIANCE 0.0004 | |
| 1237 | |
| 1238 /** define the absolute minimum number of samples which must be present in | |
| 1239 order to accurately test hypotheses about underlying probability | |
| 1240 distributions. Define separately the minimum samples that are needed | |
| 1241 before a statistical analysis is attempted; this number should be | |
| 1242 equal to MINSAMPLES but can be set to a lower number for early testing | |
| 1243 when very few samples are available. */ | |
| 1244 #define MINSAMPLESPERBUCKET 5 | |
| 1245 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET) | |
| 1246 #define MINSAMPLESNEEDED 1 | |
| 1247 | |
| 1248 /** define the size of the table which maps normalized samples to | |
| 1249 histogram buckets. Also define the number of standard deviations | |
| 1250 in a normal distribution which are considered to be significant. | |
| 1251 The mapping table will be defined in such a way that it covers | |
| 1252 the specified number of standard deviations on either side of | |
| 1253 the mean. BUCKETTABLESIZE should always be even. */ | |
| 1254 #define BUCKETTABLESIZE 1024 | |
| 1255 #define NORMALEXTENT 3.0 | |
| 1256 | |
| 1257 struct TEMPCLUSTER { | |
| 1258 CLUSTER *Cluster; | |
| 1259 CLUSTER *Neighbor; | |
| 1260 }; | |
| 1261 | |
| 1262 using ClusterPair = tesseract::KDPairInc<float, TEMPCLUSTER *>; | |
| 1263 using ClusterHeap = tesseract::GenericHeap<ClusterPair>; | |
| 1264 | |
| 1265 struct STATISTICS { | |
| 1266 STATISTICS(size_t n) : CoVariance(n * n), Min(n), Max(n) { | |
| 1267 } | |
| 1268 float AvgVariance = 1.0f; | |
| 1269 std::vector<float> CoVariance; | |
| 1270 std::vector<float> Min; // largest negative distance from the mean | |
| 1271 std::vector<float> Max; // largest positive distance from the mean | |
| 1272 }; | |
| 1273 | |
| 1274 struct BUCKETS { | |
| 1275 BUCKETS(size_t n) : NumberOfBuckets(n), Count(n), ExpectedCount(n) { | |
| 1276 } | |
| 1277 ~BUCKETS() { | |
| 1278 } | |
| 1279 DISTRIBUTION Distribution = normal; // distribution being tested for | |
| 1280 uint32_t SampleCount = 0; // # of samples in histogram | |
| 1281 double Confidence = 0.0; // confidence level of test | |
| 1282 double ChiSquared = 0.0; // test threshold | |
| 1283 uint16_t NumberOfBuckets; // number of cells in histogram | |
| 1284 uint16_t Bucket[BUCKETTABLESIZE]; // mapping to histogram buckets | |
| 1285 std::vector<uint32_t> Count; // frequency of occurrence histogram | |
| 1286 std::vector<float> ExpectedCount; // expected histogram | |
| 1287 }; | |
| 1288 | |
| 1289 struct CHISTRUCT { | |
| 1290 /// This constructor allocates a new data structure which is used | |
| 1291 /// to hold a chi-squared value along with its associated | |
| 1292 /// number of degrees of freedom and alpha value. | |
| 1293 /// | |
| 1294 /// @param degreesOfFreedom degrees of freedom for new chi value | |
| 1295 /// @param alpha confidence level for new chi value | |
| 1296 CHISTRUCT(uint16_t degreesOfFreedom, double alpha) : DegreesOfFreedom(degreesOfFreedom), Alpha(alpha) { | |
| 1297 } | |
| 1298 uint16_t DegreesOfFreedom = 0; | |
| 1299 double Alpha = 0.0; | |
| 1300 double ChiSquared = 0.0; | |
| 1301 }; | |
| 1302 | |
| 1303 // For use with KDWalk / MakePotentialClusters | |
| 1304 struct ClusteringContext { | |
| 1305 ClusterHeap *heap; // heap used to hold temp clusters, "best" on top | |
| 1306 TEMPCLUSTER *candidates; // array of potential clusters | |
| 1307 KDTREE *tree; // kd-tree to be searched for neighbors | |
| 1308 int32_t next; // next candidate to be used | |
| 1309 }; | |
| 1310 | |
| 1311 using DENSITYFUNC = double (*)(int32_t); | |
| 1312 using SOLVEFUNC = double (*)(CHISTRUCT *, double); | |
| 1313 | |
| 1314 #define Odd(N) ((N) % 2) | |
| 1315 #define Mirror(N, R) ((R) - (N)-1) | |
| 1316 #define Abs(N) (((N) < 0) ? (-(N)) : (N)) | |
| 1317 | |
| 1318 //--------------Global Data Definitions and Declarations---------------------- | |
| 1319 /** the following variables describe a discrete normal distribution | |
| 1320 which is used by NormalDensity() and NormalBucket(). The | |
| 1321 constant NORMALEXTENT determines how many standard | |
| 1322 deviations of the distribution are mapped onto the fixed | |
| 1323 discrete range of x. x=0 is mapped to -NORMALEXTENT standard | |
| 1324 deviations and x=BUCKETTABLESIZE is mapped to | |
| 1325 +NORMALEXTENT standard deviations. */ | |
| 1326 #define SqrtOf2Pi 2.506628275 | |
| 1327 static const double kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT); | |
| 1328 static const double kNormalVariance = | |
| 1329 (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT); | |
| 1330 static const double kNormalMagnitude = (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE); | |
| 1331 static const double kNormalMean = BUCKETTABLESIZE / 2; | |
| 1332 | |
| 1333 /** define lookup tables used to compute the number of histogram buckets | |
| 1334 that should be used for a given number of samples. */ | |
| 1335 #define LOOKUPTABLESIZE 8 | |
| 1336 #define MAXDEGREESOFFREEDOM MAXBUCKETS | |
| 1337 | |
| 1338 static const uint32_t kCountTable[LOOKUPTABLESIZE] = {MINSAMPLES, 200, 400, 600, 800, | |
| 1339 1000, 1500, 2000}; // number of samples | |
| 1340 | |
| 1341 static const uint16_t kBucketsTable[LOOKUPTABLESIZE] = { | |
| 1342 MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS}; // number of buckets | |
| 1343 | |
| 1344 /*------------------------------------------------------------------------- | |
| 1345 Private Function Prototypes | |
| 1346 --------------------------------------------------------------------------*/ | |
| 1347 static void CreateClusterTree(CLUSTERER *Clusterer); | |
| 1348 | |
| 1349 static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level); | |
| 1350 | |
| 1351 static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance); | |
| 1352 | |
| 1353 static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster); | |
| 1354 | |
| 1355 static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config); | |
| 1356 | |
| 1357 static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster); | |
| 1358 | |
| 1359 static PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, | |
| 1360 PROTOSTYLE Style, int32_t MinSamples); | |
| 1361 | |
| 1362 static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, | |
| 1363 STATISTICS *Statistics); | |
| 1364 | |
| 1365 static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, | |
| 1366 BUCKETS *Buckets); | |
| 1367 | |
| 1368 static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, | |
| 1369 STATISTICS *Statistics, BUCKETS *Buckets); | |
| 1370 | |
| 1371 static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, | |
| 1372 BUCKETS *NormalBuckets, double Confidence); | |
| 1373 | |
| 1374 static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc); | |
| 1375 | |
| 1376 static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics); | |
| 1377 | |
| 1378 static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster); | |
| 1379 | |
| 1380 static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics); | |
| 1381 | |
| 1382 static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics); | |
| 1383 | |
| 1384 static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics); | |
| 1385 | |
| 1386 static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster); | |
| 1387 | |
| 1388 static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence); | |
| 1389 | |
| 1390 static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount, | |
| 1391 double Confidence); | |
| 1392 | |
| 1393 static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence); | |
| 1394 | |
| 1395 static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount); | |
| 1396 | |
| 1397 static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha); | |
| 1398 | |
| 1399 static double NormalDensity(int32_t x); | |
| 1400 | |
| 1401 static double UniformDensity(int32_t x); | |
| 1402 | |
| 1403 static double Integral(double f1, double f2, double Dx); | |
| 1404 | |
| 1405 static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc, | |
| 1406 float Mean, float StdDev); | |
| 1407 | |
| 1408 static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev); | |
| 1409 | |
| 1410 static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev); | |
| 1411 | |
| 1412 static bool DistributionOK(BUCKETS *Buckets); | |
| 1413 | |
| 1414 static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets); | |
| 1415 | |
| 1416 static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount); | |
| 1417 | |
| 1418 static void InitBuckets(BUCKETS *Buckets); | |
| 1419 | |
| 1420 static int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct, | |
| 1421 void *arg2); // CHISTRUCT *SearchKey); | |
| 1422 | |
| 1423 static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy); | |
| 1424 | |
| 1425 static double ChiArea(CHISTRUCT *ChiParams, double x); | |
| 1426 | |
| 1427 static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal); | |
| 1428 | |
| 1429 static double InvertMatrix(const float *input, int size, float *inv); | |
| 1430 | |
| 1431 //--------------------------Public Code-------------------------------------- | |
| 1432 /** | |
| 1433 * This routine creates a new clusterer data structure, | |
| 1434 * initializes it, and returns a pointer to it. | |
| 1435 * | |
| 1436 * @param SampleSize number of dimensions in feature space | |
| 1437 * @param ParamDesc description of each dimension | |
| 1438 * @return pointer to the new clusterer data structure | |
| 1439 */ | |
| 1440 CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]) { | |
| 1441 int i; | |
| 1442 | |
| 1443 // allocate main clusterer data structure and init simple fields | |
| 1444 auto Clusterer = new CLUSTERER; | |
| 1445 Clusterer->SampleSize = SampleSize; | |
| 1446 Clusterer->NumberOfSamples = 0; | |
| 1447 Clusterer->NumChar = 0; | |
| 1448 | |
| 1449 // init fields which will not be used initially | |
| 1450 Clusterer->Root = nullptr; | |
| 1451 Clusterer->ProtoList = NIL_LIST; | |
| 1452 | |
| 1453 // maintain a copy of param descriptors in the clusterer data structure | |
| 1454 Clusterer->ParamDesc = new PARAM_DESC[SampleSize]; | |
| 1455 for (i = 0; i < SampleSize; i++) { | |
| 1456 Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular; | |
| 1457 Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential; | |
| 1458 Clusterer->ParamDesc[i].Min = ParamDesc[i].Min; | |
| 1459 Clusterer->ParamDesc[i].Max = ParamDesc[i].Max; | |
| 1460 Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min; | |
| 1461 Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2; | |
| 1462 Clusterer->ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2; | |
| 1463 } | |
| 1464 | |
| 1465 // allocate a kd tree to hold the samples | |
| 1466 Clusterer->KDTree = MakeKDTree(SampleSize, ParamDesc); | |
| 1467 | |
| 1468 // Initialize cache of histogram buckets to minimize recomputing them. | |
| 1469 for (auto &d : Clusterer->bucket_cache) { | |
| 1470 for (auto &c : d) { | |
| 1471 c = nullptr; | |
| 1472 } | |
| 1473 } | |
| 1474 | |
| 1475 return Clusterer; | |
| 1476 } // MakeClusterer | |
| 1477 | |
| 1478 /** | |
| 1479 * This routine creates a new sample data structure to hold | |
| 1480 * the specified feature. This sample is added to the clusterer | |
| 1481 * data structure (so that it knows which samples are to be | |
| 1482 * clustered later), and a pointer to the sample is returned to | |
| 1483 * the caller. | |
| 1484 * | |
| 1485 * @param Clusterer clusterer data structure to add sample to | |
| 1486 * @param Feature feature to be added to clusterer | |
| 1487 * @param CharID unique ident. of char that sample came from | |
| 1488 * | |
| 1489 * @return Pointer to the new sample data structure | |
| 1490 */ | |
| 1491 SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID) { | |
| 1492 int i; | |
| 1493 | |
| 1494 // see if the samples have already been clustered - if so trap an error | |
| 1495 // Can't add samples after they have been clustered. | |
| 1496 ASSERT_HOST(Clusterer->Root == nullptr); | |
| 1497 | |
| 1498 // allocate the new sample and initialize it | |
| 1499 auto Sample = new SAMPLE(Clusterer->SampleSize); | |
| 1500 Sample->Clustered = false; | |
| 1501 Sample->Prototype = false; | |
| 1502 Sample->SampleCount = 1; | |
| 1503 Sample->Left = nullptr; | |
| 1504 Sample->Right = nullptr; | |
| 1505 Sample->CharID = CharID; | |
| 1506 | |
| 1507 for (i = 0; i < Clusterer->SampleSize; i++) { | |
| 1508 Sample->Mean[i] = Feature[i]; | |
| 1509 } | |
| 1510 | |
| 1511 // add the sample to the KD tree - keep track of the total # of samples | |
| 1512 Clusterer->NumberOfSamples++; | |
| 1513 KDStore(Clusterer->KDTree, &Sample->Mean[0], Sample); | |
| 1514 if (CharID >= Clusterer->NumChar) { | |
| 1515 Clusterer->NumChar = CharID + 1; | |
| 1516 } | |
| 1517 | |
| 1518 // execute hook for monitoring clustering operation | |
| 1519 // (*SampleCreationHook)(Sample); | |
| 1520 | |
| 1521 return (Sample); | |
| 1522 } // MakeSample | |
| 1523 | |
/**
 * This routine first checks to see if the samples in this
 * clusterer have already been clustered before; if so, it does
 * not bother to recreate the cluster tree. It simply recomputes
 * the prototypes based on the new Config info.
 *
 * If the samples have not been clustered before, the
 * samples in the KD tree are formed into a cluster tree and then
 * the prototypes are computed from the cluster tree.
 *
 * In either case this routine returns a pointer to a
 * list of prototypes that best represent the samples given
 * the constraints specified in Config.
 *
 * @param Clusterer data struct containing samples to be clustered
 * @param Config parameters which control clustering process
 *
 * @return Pointer to a list of prototypes
 */
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
  // only create cluster tree if samples have never been clustered before
  if (Clusterer->Root == nullptr) {
    CreateClusterTree(Clusterer);
  }

  // deallocate the old prototype list if one exists
  // (allows ClusterSamples to be called repeatedly with different Configs)
  FreeProtoList(&Clusterer->ProtoList);
  Clusterer->ProtoList = NIL_LIST;

  // compute prototypes starting at the root node in the tree
  ComputePrototypes(Clusterer, Config);
  // We don't need the cluster pointers in the protos any more, so null them
  // out, which makes it safe to delete the clusterer.
  LIST proto_list = Clusterer->ProtoList;
  iterate(proto_list) {
    auto *proto = reinterpret_cast<PROTOTYPE *>(proto_list->first_node());
    proto->Cluster = nullptr;
  }
  return Clusterer->ProtoList;
} // ClusterSamples
| 1564 | |
| 1565 /** | |
| 1566 * This routine frees all of the memory allocated to the | |
| 1567 * specified data structure. It will not, however, free | |
| 1568 * the memory used by the prototype list. The pointers to | |
| 1569 * the clusters for each prototype in the list will be set | |
| 1570 * to nullptr to indicate that the cluster data structures no | |
| 1571 * longer exist. Any sample lists that have been obtained | |
| 1572 * via calls to GetSamples are no longer valid. | |
| 1573 * @param Clusterer pointer to data structure to be freed | |
| 1574 */ | |
| 1575 void FreeClusterer(CLUSTERER *Clusterer) { | |
| 1576 if (Clusterer != nullptr) { | |
| 1577 delete[] Clusterer->ParamDesc; | |
| 1578 delete Clusterer->KDTree; | |
| 1579 delete Clusterer->Root; | |
| 1580 // Free up all used buckets structures. | |
| 1581 for (auto &d : Clusterer->bucket_cache) { | |
| 1582 for (auto &c : d) { | |
| 1583 delete c; | |
| 1584 } | |
| 1585 } | |
| 1586 | |
| 1587 delete Clusterer; | |
| 1588 } | |
| 1589 } // FreeClusterer | |
| 1590 | |
| 1591 /** | |
| 1592 * This routine frees all of the memory allocated to the | |
| 1593 * specified list of prototypes. The clusters which are | |
| 1594 * pointed to by the prototypes are not freed. | |
| 1595 * @param ProtoList pointer to list of prototypes to be freed | |
| 1596 */ | |
| 1597 void FreeProtoList(LIST *ProtoList) { | |
| 1598 destroy_nodes(*ProtoList, FreePrototype); | |
| 1599 } // FreeProtoList | |
| 1600 | |
| 1601 /** | |
| 1602 * This routine deallocates the memory consumed by the specified | |
| 1603 * prototype and modifies the corresponding cluster so that it | |
| 1604 * is no longer marked as a prototype. The cluster is NOT | |
| 1605 * deallocated by this routine. | |
| 1606 * @param arg prototype data structure to be deallocated | |
| 1607 */ | |
| 1608 void FreePrototype(void *arg) { // PROTOTYPE *Prototype) | |
| 1609 auto *Prototype = static_cast<PROTOTYPE *>(arg); | |
| 1610 | |
| 1611 // unmark the corresponding cluster (if there is one | |
| 1612 if (Prototype->Cluster != nullptr) { | |
| 1613 Prototype->Cluster->Prototype = false; | |
| 1614 } | |
| 1615 | |
| 1616 // deallocate the prototype statistics and then the prototype itself | |
| 1617 if (Prototype->Style != spherical) { | |
| 1618 delete[] Prototype->Variance.Elliptical; | |
| 1619 delete[] Prototype->Magnitude.Elliptical; | |
| 1620 delete[] Prototype->Weight.Elliptical; | |
| 1621 } | |
| 1622 delete Prototype; | |
| 1623 } // FreePrototype | |
| 1624 | |
| 1625 /** | |
| 1626 * This routine is used to find all of the samples which | |
| 1627 * belong to a cluster. It starts by removing the top | |
| 1628 * cluster on the cluster list (SearchState). If this cluster is | |
| 1629 * a leaf it is returned. Otherwise, the right subcluster | |
| 1630 * is pushed on the list and we continue the search in the | |
| 1631 * left subcluster. This continues until a leaf is found. | |
| 1632 * If all samples have been found, nullptr is returned. | |
| 1633 * InitSampleSearch() must be called | |
| 1634 * before NextSample() to initialize the search. | |
| 1635 * @param SearchState ptr to list containing clusters to be searched | |
| 1636 * @return Pointer to the next leaf cluster (sample) or nullptr. | |
| 1637 */ | |
| 1638 CLUSTER *NextSample(LIST *SearchState) { | |
| 1639 CLUSTER *Cluster; | |
| 1640 | |
| 1641 if (*SearchState == NIL_LIST) { | |
| 1642 return (nullptr); | |
| 1643 } | |
| 1644 Cluster = reinterpret_cast<CLUSTER *>((*SearchState)->first_node()); | |
| 1645 *SearchState = pop(*SearchState); | |
| 1646 for (;;) { | |
| 1647 if (Cluster->Left == nullptr) { | |
| 1648 return (Cluster); | |
| 1649 } | |
| 1650 *SearchState = push(*SearchState, Cluster->Right); | |
| 1651 Cluster = Cluster->Left; | |
| 1652 } | |
| 1653 } // NextSample | |
| 1654 | |
| 1655 /** | |
| 1656 * This routine returns the mean of the specified | |
| 1657 * prototype in the indicated dimension. | |
| 1658 * @param Proto prototype to return mean of | |
| 1659 * @param Dimension dimension whose mean is to be returned | |
| 1660 * @return Mean of Prototype in Dimension | |
| 1661 */ | |
| 1662 float Mean(PROTOTYPE *Proto, uint16_t Dimension) { | |
| 1663 return (Proto->Mean[Dimension]); | |
| 1664 } // Mean | |
| 1665 | |
| 1666 /** | |
| 1667 * This routine returns the standard deviation of the | |
| 1668 * prototype in the indicated dimension. | |
| 1669 * @param Proto prototype to return standard deviation of | |
| 1670 * @param Dimension dimension whose stddev is to be returned | |
| 1671 * @return Standard deviation of Prototype in Dimension | |
| 1672 */ | |
| 1673 float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension) { | |
| 1674 switch (Proto->Style) { | |
| 1675 case spherical: | |
| 1676 return std::sqrt(Proto->Variance.Spherical); | |
| 1677 case elliptical: | |
| 1678 return std::sqrt(Proto->Variance.Elliptical[Dimension]); | |
| 1679 case mixed: | |
| 1680 switch (Proto->Distrib[Dimension]) { | |
| 1681 case normal: | |
| 1682 return std::sqrt(Proto->Variance.Elliptical[Dimension]); | |
| 1683 case uniform: | |
| 1684 case D_random: | |
| 1685 return Proto->Variance.Elliptical[Dimension]; | |
| 1686 case DISTRIBUTION_COUNT: | |
| 1687 ASSERT_HOST(!"Distribution count not allowed!"); | |
| 1688 } | |
| 1689 } | |
| 1690 return 0.0f; | |
| 1691 } // StandardDeviation | |
| 1692 | |
| 1693 /*--------------------------------------------------------------------------- | |
| 1694 Private Code | |
| 1695 ----------------------------------------------------------------------------*/ | |
/**
 * This routine performs a bottoms-up clustering on the samples
 * held in the kd-tree of the Clusterer data structure. The
 * result is a cluster tree. Each node in the tree represents
 * a cluster which conceptually contains a subset of the samples.
 * More precisely, the cluster contains all of the samples which
 * are contained in its two sub-clusters. The leaves of the
 * tree are the individual samples themselves; they have no
 * sub-clusters. The root node of the tree conceptually contains
 * all of the samples.
 * The Clusterer data structure is changed.
 * @param Clusterer data structure holdings samples to be clustered
 */
static void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  ClusterPair HeapEntry;

  // each sample and its nearest neighbor form a "potential" cluster
  // save these in a heap with the "best" potential clusters on top
  // (ClusterPair keys on distance, so Pop yields the closest pair first)
  context.tree = Clusterer->KDTree;
  context.candidates = new TEMPCLUSTER[Clusterer->NumberOfSamples];
  context.next = 0;
  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, MakePotentialClusters, &context);

  // form potential clusters into actual clusters - always do "best" first
  while (context.heap->Pop(&HeapEntry)) {
    TEMPCLUSTER *PotentialCluster = HeapEntry.data();

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // if main cluster is not yet clustered, but its nearest neighbor is
    // then we must find a new nearest neighbor
    // (the entry is re-pushed so it competes again at its new distance)
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }

    // if neither cluster is already clustered, form permanent cluster
    // and immediately make the merged node a candidate for further merging
    else {
      PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }
  }

  // the root node in the cluster tree is now the only node in the kd-tree
  Clusterer->Root = static_cast<CLUSTER *> RootOf(Clusterer->KDTree);

  // free up the memory used by the K-D tree, heap, and temp clusters
  // (the cluster tree itself survives, reachable via Clusterer->Root)
  delete context.tree;
  Clusterer->KDTree = nullptr;
  delete context.heap;
  delete[] context.candidates;
} // CreateClusterTree
| 1761 | |
| 1762 /** | |
| 1763 * This routine is designed to be used in concert with the | |
| 1764 * KDWalk routine. It will create a potential cluster for | |
| 1765 * each sample in the kd-tree that is being walked. This | |
| 1766 * potential cluster will then be pushed on the heap. | |
| 1767 * @param context ClusteringContext (see definition above) | |
| 1768 * @param Cluster current cluster being visited in kd-tree walk | |
| 1769 * @param Level level of this cluster in the kd-tree | |
| 1770 */ | |
| 1771 static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t /*Level*/) { | |
| 1772 ClusterPair HeapEntry; | |
| 1773 int next = context->next; | |
| 1774 context->candidates[next].Cluster = Cluster; | |
| 1775 HeapEntry.data() = &(context->candidates[next]); | |
| 1776 context->candidates[next].Neighbor = | |
| 1777 FindNearestNeighbor(context->tree, context->candidates[next].Cluster, &HeapEntry.key()); | |
| 1778 if (context->candidates[next].Neighbor != nullptr) { | |
| 1779 context->heap->Push(&HeapEntry); | |
| 1780 context->next++; | |
| 1781 } | |
| 1782 } // MakePotentialClusters | |
| 1783 | |
| 1784 /** | |
| 1785 * This routine searches the specified kd-tree for the nearest | |
| 1786 * neighbor of the specified cluster. It actually uses the | |
| 1787 * kd routines to find the 2 nearest neighbors since one of them | |
| 1788 * will be the original cluster. A pointer to the nearest | |
| 1789 * neighbor is returned, if it can be found, otherwise nullptr is | |
| 1790 * returned. The distance between the 2 nodes is placed | |
| 1791 * in the specified variable. | |
| 1792 * @param Tree kd-tree to search in for nearest neighbor | |
| 1793 * @param Cluster cluster whose nearest neighbor is to be found | |
| 1794 * @param Distance ptr to variable to report distance found | |
| 1795 * @return Pointer to the nearest neighbor of Cluster, or nullptr | |
| 1796 */ | |
| 1797 static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance) | |
| 1798 #define MAXNEIGHBORS 2 | |
| 1799 #define MAXDISTANCE FLT_MAX | |
| 1800 { | |
| 1801 CLUSTER *Neighbor[MAXNEIGHBORS]; | |
| 1802 float Dist[MAXNEIGHBORS]; | |
| 1803 int NumberOfNeighbors; | |
| 1804 int32_t i; | |
| 1805 CLUSTER *BestNeighbor; | |
| 1806 | |
| 1807 // find the 2 nearest neighbors of the cluster | |
| 1808 KDNearestNeighborSearch(Tree, &Cluster->Mean[0], MAXNEIGHBORS, MAXDISTANCE, &NumberOfNeighbors, | |
| 1809 reinterpret_cast<void **>(Neighbor), Dist); | |
| 1810 | |
| 1811 // search for the nearest neighbor that is not the cluster itself | |
| 1812 *Distance = MAXDISTANCE; | |
| 1813 BestNeighbor = nullptr; | |
| 1814 for (i = 0; i < NumberOfNeighbors; i++) { | |
| 1815 if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) { | |
| 1816 *Distance = Dist[i]; | |
| 1817 BestNeighbor = Neighbor[i]; | |
| 1818 } | |
| 1819 } | |
| 1820 return BestNeighbor; | |
| 1821 } // FindNearestNeighbor | |
| 1822 | |
| 1823 /** | |
| 1824 * This routine creates a new permanent cluster from the | |
| 1825 * clusters specified in TempCluster. The 2 clusters in | |
| 1826 * TempCluster are marked as "clustered" and deleted from | |
| 1827 * the kd-tree. The new cluster is then added to the kd-tree. | |
| 1828 * @param Clusterer current clustering environment | |
| 1829 * @param TempCluster potential cluster to make permanent | |
| 1830 * @return Pointer to the new permanent cluster | |
| 1831 */ | |
| 1832 static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) { | |
| 1833 // allocate the new cluster and initialize it | |
| 1834 auto Cluster = new CLUSTER(Clusterer->SampleSize); | |
| 1835 Cluster->Clustered = false; | |
| 1836 Cluster->Prototype = false; | |
| 1837 Cluster->Left = TempCluster->Cluster; | |
| 1838 Cluster->Right = TempCluster->Neighbor; | |
| 1839 Cluster->CharID = -1; | |
| 1840 | |
| 1841 // mark the old clusters as "clustered" and delete them from the kd-tree | |
| 1842 Cluster->Left->Clustered = true; | |
| 1843 Cluster->Right->Clustered = true; | |
| 1844 KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left); | |
| 1845 KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right); | |
| 1846 | |
| 1847 // compute the mean and sample count for the new cluster | |
| 1848 Cluster->SampleCount = MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc, | |
| 1849 Cluster->Left->SampleCount, Cluster->Right->SampleCount, | |
| 1850 &Cluster->Mean[0], &Cluster->Left->Mean[0], &Cluster->Right->Mean[0]); | |
| 1851 | |
| 1852 // add the new cluster to the KD tree | |
| 1853 KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster); | |
| 1854 return Cluster; | |
| 1855 } // MakeNewCluster | |
| 1856 | |
| 1857 /** | |
| 1858 * This routine merges two clusters into one larger cluster. | |
| 1859 * To do this it computes the number of samples in the new | |
| 1860 * cluster and the mean of the new cluster. The ParamDesc | |
| 1861 * information is used to ensure that circular dimensions | |
| 1862 * are handled correctly. | |
| 1863 * @param N # of dimensions (size of arrays) | |
| 1864 * @param ParamDesc array of dimension descriptions | |
| 1865 * @param n1, n2 number of samples in each old cluster | |
| 1866 * @param m array to hold mean of new cluster | |
| 1867 * @param m1, m2 arrays containing means of old clusters | |
| 1868 * @return The number of samples in the new cluster. | |
| 1869 */ | |
| 1870 int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], | |
| 1871 float m1[], float m2[]) { | |
| 1872 int32_t i, n; | |
| 1873 | |
| 1874 n = n1 + n2; | |
| 1875 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) { | |
| 1876 if (ParamDesc->Circular) { | |
| 1877 // if distance between means is greater than allowed | |
| 1878 // reduce upper point by one "rotation" to compute mean | |
| 1879 // then normalize the mean back into the accepted range | |
| 1880 if ((*m2 - *m1) > ParamDesc->HalfRange) { | |
| 1881 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n; | |
| 1882 if (*m < ParamDesc->Min) { | |
| 1883 *m += ParamDesc->Range; | |
| 1884 } | |
| 1885 } else if ((*m1 - *m2) > ParamDesc->HalfRange) { | |
| 1886 *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n; | |
| 1887 if (*m < ParamDesc->Min) { | |
| 1888 *m += ParamDesc->Range; | |
| 1889 } | |
| 1890 } else { | |
| 1891 *m = (n1 * *m1 + n2 * *m2) / n; | |
| 1892 } | |
| 1893 } else { | |
| 1894 *m = (n1 * *m1 + n2 * *m2) / n; | |
| 1895 } | |
| 1896 } | |
| 1897 return n; | |
| 1898 } // MergeClusters | |
| 1899 | |
| 1900 /** | |
| 1901 * This routine decides which clusters in the cluster tree | |
| 1902 * should be represented by prototypes, forms a list of these | |
| 1903 * prototypes, and places the list in the Clusterer data | |
| 1904 * structure. | |
| 1905 * @param Clusterer data structure holding cluster tree | |
| 1906 * @param Config parameters used to control prototype generation | |
| 1907 */ | |
| 1908 static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { | |
| 1909 LIST ClusterStack = NIL_LIST; | |
| 1910 CLUSTER *Cluster; | |
| 1911 PROTOTYPE *Prototype; | |
| 1912 | |
| 1913 // use a stack to keep track of clusters waiting to be processed | |
| 1914 // initially the only cluster on the stack is the root cluster | |
| 1915 if (Clusterer->Root != nullptr) { | |
| 1916 ClusterStack = push(NIL_LIST, Clusterer->Root); | |
| 1917 } | |
| 1918 | |
| 1919 // loop until we have analyzed all clusters which are potential prototypes | |
| 1920 while (ClusterStack != NIL_LIST) { | |
| 1921 // remove the next cluster to be analyzed from the stack | |
| 1922 // try to make a prototype from the cluster | |
| 1923 // if successful, put it on the proto list, else split the cluster | |
| 1924 Cluster = reinterpret_cast<CLUSTER *>(ClusterStack->first_node()); | |
| 1925 ClusterStack = pop(ClusterStack); | |
| 1926 Prototype = MakePrototype(Clusterer, Config, Cluster); | |
| 1927 if (Prototype != nullptr) { | |
| 1928 Clusterer->ProtoList = push(Clusterer->ProtoList, Prototype); | |
| 1929 } else { | |
| 1930 ClusterStack = push(ClusterStack, Cluster->Right); | |
| 1931 ClusterStack = push(ClusterStack, Cluster->Left); | |
| 1932 } | |
| 1933 } | |
| 1934 } // ComputePrototypes | |
| 1935 | |
/**
 * This routine attempts to create a prototype from the
 * specified cluster that conforms to the distribution
 * specified in Config. If there are too few samples in the
 * cluster to perform a statistical analysis, then a prototype
 * is generated but labelled as insignificant. If the
 * dimensions of the cluster are not independent, no prototype
 * is generated and nullptr is returned. If a prototype can be
 * found that matches the desired distribution then a pointer
 * to it is returned, otherwise nullptr is returned.
 * @param Clusterer data structure holding cluster tree
 * @param Config parameters used to control prototype generation
 * @param Cluster cluster to be made into a prototype
 * @return Pointer to new prototype or nullptr
 */
static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster) {
  PROTOTYPE *Proto;
  BUCKETS *Buckets;

  // filter out clusters which contain samples from the same character
  if (MultipleCharSamples(Clusterer, Cluster, Config->MaxIllegal)) {
    return nullptr;
  }

  // compute the covariance matrix and ranges for the cluster
  // (Statistics is owned locally and freed on every return path below)
  auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);

  // check for degenerate clusters which need not be analyzed further
  // note that the MinSamples test assumes that all clusters with multiple
  // character samples have been removed (as above)
  Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
                              static_cast<int32_t>(Config->MinSamples * Clusterer->NumChar));
  if (Proto != nullptr) {
    delete Statistics;
    return Proto;
  }
  // check to ensure that all dimensions are independent
  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],
                   Config->Independence)) {
    delete Statistics;
    return nullptr;
  }

  // When enabled, use Hotelling's test to accept an elliptical proto
  // without splitting (see TestEllipticalProto below).
  if (HOTELLING && Config->ProtoStyle == elliptical) {
    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
    if (Proto != nullptr) {
      delete Statistics;
      return Proto;
    }
  }

  // create a histogram data structure used to evaluate distributions
  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount, Config->Confidence);

  // create a prototype based on the statistics and test it
  // "automatic" tries the styles in order of increasing generality.
  switch (Config->ProtoStyle) {
    case spherical:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case elliptical:
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case mixed:
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
    case automatic:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
  }
  delete Statistics;
  return Proto;
} // MakePrototype
| 2016 | |
| 2017 /** | |
| 2018 * This routine checks for clusters which are degenerate and | |
| 2019 * therefore cannot be analyzed in a statistically valid way. | |
| 2020 * A cluster is defined as degenerate if it does not have at | |
| 2021 * least MINSAMPLESNEEDED samples in it. If the cluster is | |
| 2022 * found to be degenerate, a prototype of the specified style | |
| 2023 * is generated and marked as insignificant. A cluster is | |
| 2024 * also degenerate if it does not have at least MinSamples | |
| 2025 * samples in it. | |
| 2026 * | |
| 2027 * If the cluster is not degenerate, nullptr is returned. | |
| 2028 * | |
| 2029 * @param N number of dimensions | |
| 2030 * @param Cluster cluster being analyzed | |
| 2031 * @param Statistics statistical info about cluster | |
| 2032 * @param Style type of prototype to be generated | |
| 2033 * @param MinSamples minimum number of samples in a cluster | |
| 2034 * @return Pointer to degenerate prototype or nullptr. | |
| 2035 */ | |
| 2036 static PROTOTYPE *MakeDegenerateProto( // this was MinSample | |
| 2037 uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples) { | |
| 2038 PROTOTYPE *Proto = nullptr; | |
| 2039 | |
| 2040 if (MinSamples < MINSAMPLESNEEDED) { | |
| 2041 MinSamples = MINSAMPLESNEEDED; | |
| 2042 } | |
| 2043 | |
| 2044 if (Cluster->SampleCount < MinSamples) { | |
| 2045 switch (Style) { | |
| 2046 case spherical: | |
| 2047 Proto = NewSphericalProto(N, Cluster, Statistics); | |
| 2048 break; | |
| 2049 case elliptical: | |
| 2050 case automatic: | |
| 2051 Proto = NewEllipticalProto(N, Cluster, Statistics); | |
| 2052 break; | |
| 2053 case mixed: | |
| 2054 Proto = NewMixedProto(N, Cluster, Statistics); | |
| 2055 break; | |
| 2056 } | |
| 2057 Proto->Significant = false; | |
| 2058 } | |
| 2059 return (Proto); | |
| 2060 } // MakeDegenerateProto | |
| 2061 | |
/**
 * This routine tests the specified cluster to see if
 * there is a statistically significant difference between
 * the sub-clusters that would be made if the cluster were to
 * be split. If not, then a new prototype is formed and
 * returned to the caller. If there is, then nullptr is returned
 * to the caller. The decision uses Hotelling's T-squared test on the
 * difference between the two sub-cluster means.
 * @param Clusterer data struct containing samples being clustered
 * @param Config provides the magic number of samples that make a good cluster
 * @param Cluster cluster to be made into an elliptical prototype
 * @param Statistics statistical info about cluster
 * @return Pointer to new elliptical prototype or nullptr.
 */
static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER *Left = Cluster->Left;
  CLUSTER *Right = Cluster->Right;
  // Leaf clusters have nothing to split.
  if (Left == nullptr || Right == nullptr) {
    return nullptr;
  }
  // Too few samples relative to the dimensionality for the test to be valid.
  int TotalDims = Left->SampleCount + Right->SampleCount;
  if (TotalDims < N + 1 || TotalDims < 2) {
    return nullptr;
  }
  std::vector<float> Covariance(static_cast<size_t>(N) * N);
  std::vector<float> Inverse(static_cast<size_t>(N) * N);
  std::vector<float> Delta(N);
  // Compute a new covariance matrix that only uses essential features.
  // Non-essential rows/columns are replaced with identity entries so the
  // matrix stays invertible without influencing the statistic.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential) {
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j) {
          Covariance[j + row_offset] = 1.0f;
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    }
  }
  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
  if (err > 1) {
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  // Mean difference vector over the essential dimensions only.
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared: Delta' * Inverse * Delta.
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);
  // Clamp the degrees of freedom to the bounds of the precomputed FTable.
  int Fx = EssentialN;
  if (Fx > FTABLE_X) {
    Fx = FTABLE_X;
  }
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y) {
    Fy = FTABLE_Y;
  }
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 && TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  // Below threshold: the split is not significant, keep the cluster whole.
  if (F < FTarget) {
    return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return nullptr;
}
| 2167 | |
| 2168 /** | |
| 2169 * This routine tests the specified cluster to see if it can | |
| 2170 * be approximated by a spherical normal distribution. If it | |
| 2171 * can be, then a new prototype is formed and returned to the | |
| 2172 * caller. If it can't be, then nullptr is returned to the caller. | |
| 2173 * @param Clusterer data struct containing samples being clustered | |
| 2174 * @param Cluster cluster to be made into a spherical prototype | |
| 2175 * @param Statistics statistical info about cluster | |
| 2176 * @param Buckets histogram struct used to analyze distribution | |
| 2177 * @return Pointer to new spherical prototype or nullptr. | |
| 2178 */ | |
| 2179 static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, | |
| 2180 BUCKETS *Buckets) { | |
| 2181 PROTOTYPE *Proto = nullptr; | |
| 2182 int i; | |
| 2183 | |
| 2184 // check that each dimension is a normal distribution | |
| 2185 for (i = 0; i < Clusterer->SampleSize; i++) { | |
| 2186 if (Clusterer->ParamDesc[i].NonEssential) { | |
| 2187 continue; | |
| 2188 } | |
| 2189 | |
| 2190 FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i], | |
| 2191 sqrt(static_cast<double>(Statistics->AvgVariance))); | |
| 2192 if (!DistributionOK(Buckets)) { | |
| 2193 break; | |
| 2194 } | |
| 2195 } | |
| 2196 // if all dimensions matched a normal distribution, make a proto | |
| 2197 if (i >= Clusterer->SampleSize) { | |
| 2198 Proto = NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics); | |
| 2199 } | |
| 2200 return (Proto); | |
| 2201 } // MakeSphericalProto | |
| 2202 | |
| 2203 /** | |
| 2204 * This routine tests the specified cluster to see if it can | |
| 2205 * be approximated by an elliptical normal distribution. If it | |
| 2206 * can be, then a new prototype is formed and returned to the | |
| 2207 * caller. If it can't be, then nullptr is returned to the caller. | |
| 2208 * @param Clusterer data struct containing samples being clustered | |
| 2209 * @param Cluster cluster to be made into an elliptical prototype | |
| 2210 * @param Statistics statistical info about cluster | |
| 2211 * @param Buckets histogram struct used to analyze distribution | |
| 2212 * @return Pointer to new elliptical prototype or nullptr. | |
| 2213 */ | |
| 2214 static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, | |
| 2215 STATISTICS *Statistics, BUCKETS *Buckets) { | |
| 2216 PROTOTYPE *Proto = nullptr; | |
| 2217 int i; | |
| 2218 | |
| 2219 // check that each dimension is a normal distribution | |
| 2220 for (i = 0; i < Clusterer->SampleSize; i++) { | |
| 2221 if (Clusterer->ParamDesc[i].NonEssential) { | |
| 2222 continue; | |
| 2223 } | |
| 2224 | |
| 2225 FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i], | |
| 2226 sqrt(static_cast<double>(Statistics->CoVariance[i * (Clusterer->SampleSize + 1)]))); | |
| 2227 if (!DistributionOK(Buckets)) { | |
| 2228 break; | |
| 2229 } | |
| 2230 } | |
| 2231 // if all dimensions matched a normal distribution, make a proto | |
| 2232 if (i >= Clusterer->SampleSize) { | |
| 2233 Proto = NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics); | |
| 2234 } | |
| 2235 return (Proto); | |
| 2236 } // MakeEllipticalProto | |
| 2237 | |
| 2238 /** | |
| 2239 * This routine tests each dimension of the specified cluster to | |
| 2240 * see what distribution would best approximate that dimension. | |
| 2241 * Each dimension is compared to the following distributions | |
| 2242 * in order: normal, random, uniform. If each dimension can | |
| 2243 * be represented by one of these distributions, | |
| 2244 * then a new prototype is formed and returned to the | |
| 2245 * caller. If it can't be, then nullptr is returned to the caller. | |
| 2246 * @param Clusterer data struct containing samples being clustered | |
| 2247 * @param Cluster cluster to be made into a prototype | |
| 2248 * @param Statistics statistical info about cluster | |
| 2249 * @param NormalBuckets histogram struct used to analyze distribution | |
| 2250 * @param Confidence confidence level for alternate distributions | |
| 2251 * @return Pointer to new mixed prototype or nullptr. | |
| 2252 */ | |
| 2253 static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, | |
| 2254 BUCKETS *NormalBuckets, double Confidence) { | |
| 2255 PROTOTYPE *Proto; | |
| 2256 int i; | |
| 2257 BUCKETS *UniformBuckets = nullptr; | |
| 2258 BUCKETS *RandomBuckets = nullptr; | |
| 2259 | |
| 2260 // create a mixed proto to work on - initially assume all dimensions normal | |
| 2261 Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics); | |
| 2262 | |
| 2263 // find the proper distribution for each dimension | |
| 2264 for (i = 0; i < Clusterer->SampleSize; i++) { | |
| 2265 if (Clusterer->ParamDesc[i].NonEssential) { | |
| 2266 continue; | |
| 2267 } | |
| 2268 | |
| 2269 FillBuckets(NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i], | |
| 2270 std::sqrt(Proto->Variance.Elliptical[i])); | |
| 2271 if (DistributionOK(NormalBuckets)) { | |
| 2272 continue; | |
| 2273 } | |
| 2274 | |
| 2275 if (RandomBuckets == nullptr) { | |
| 2276 RandomBuckets = GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence); | |
| 2277 } | |
| 2278 MakeDimRandom(i, Proto, &(Clusterer->ParamDesc[i])); | |
| 2279 FillBuckets(RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i], | |
| 2280 Proto->Variance.Elliptical[i]); | |
| 2281 if (DistributionOK(RandomBuckets)) { | |
| 2282 continue; | |
| 2283 } | |
| 2284 | |
| 2285 if (UniformBuckets == nullptr) { | |
| 2286 UniformBuckets = GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence); | |
| 2287 } | |
| 2288 MakeDimUniform(i, Proto, Statistics); | |
| 2289 FillBuckets(UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i], | |
| 2290 Proto->Variance.Elliptical[i]); | |
| 2291 if (DistributionOK(UniformBuckets)) { | |
| 2292 continue; | |
| 2293 } | |
| 2294 break; | |
| 2295 } | |
| 2296 // if any dimension failed to match a distribution, discard the proto | |
| 2297 if (i < Clusterer->SampleSize) { | |
| 2298 FreePrototype(Proto); | |
| 2299 Proto = nullptr; | |
| 2300 } | |
| 2301 return (Proto); | |
| 2302 } // MakeMixedProto | |
| 2303 | |
| 2304 /** | |
| 2305 * This routine alters the ith dimension of the specified | |
| 2306 * mixed prototype to be D_random. | |
| 2307 * @param i index of dimension to be changed | |
| 2308 * @param Proto prototype whose dimension is to be altered | |
| 2309 * @param ParamDesc description of specified dimension | |
| 2310 */ | |
| 2311 static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) { | |
| 2312 Proto->Distrib[i] = D_random; | |
| 2313 Proto->Mean[i] = ParamDesc->MidRange; | |
| 2314 Proto->Variance.Elliptical[i] = ParamDesc->HalfRange; | |
| 2315 | |
| 2316 // subtract out the previous magnitude of this dimension from the total | |
| 2317 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; | |
| 2318 Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range; | |
| 2319 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; | |
| 2320 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude)); | |
| 2321 | |
| 2322 // note that the proto Weight is irrelevant for D_random protos | |
| 2323 } // MakeDimRandom | |
| 2324 | |
| 2325 /** | |
| 2326 * This routine alters the ith dimension of the specified | |
| 2327 * mixed prototype to be uniform. | |
| 2328 * @param i index of dimension to be changed | |
| 2329 * @param Proto prototype whose dimension is to be altered | |
| 2330 * @param Statistics statistical info about prototype | |
| 2331 */ | |
| 2332 static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics) { | |
| 2333 Proto->Distrib[i] = uniform; | |
| 2334 Proto->Mean[i] = Proto->Cluster->Mean[i] + (Statistics->Min[i] + Statistics->Max[i]) / 2; | |
| 2335 Proto->Variance.Elliptical[i] = (Statistics->Max[i] - Statistics->Min[i]) / 2; | |
| 2336 if (Proto->Variance.Elliptical[i] < MINVARIANCE) { | |
| 2337 Proto->Variance.Elliptical[i] = MINVARIANCE; | |
| 2338 } | |
| 2339 | |
| 2340 // subtract out the previous magnitude of this dimension from the total | |
| 2341 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; | |
| 2342 Proto->Magnitude.Elliptical[i] = 1.0 / (2.0 * Proto->Variance.Elliptical[i]); | |
| 2343 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; | |
| 2344 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude)); | |
| 2345 | |
| 2346 // note that the proto Weight is irrelevant for uniform protos | |
| 2347 } // MakeDimUniform | |
| 2348 | |
/**
 * This routine searches the cluster tree for all leaf nodes
 * which are samples in the specified cluster. It computes
 * a full covariance matrix for these samples as well as
 * keeping track of the ranges (min and max) for each
 * dimension. A special data structure is allocated to
 * return this information to the caller. An incremental
 * algorithm for computing statistics is not used because
 * it will not work with circular dimensions.
 * @param N number of dimensions
 * @param ParamDesc array of dimension descriptions
 * @param Cluster cluster whose stats are to be computed
 * @return Pointer to new heap-allocated STATISTICS (caller owns it)
 */
static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {
  int i, j;
  LIST SearchState;
  SAMPLE *Sample;
  uint32_t SampleCountAdjustedForBias;

  // allocate memory to hold the statistics results
  auto Statistics = new STATISTICS(N);

  // allocate temporary memory to hold the sample to mean distances
  std::vector<float> Distance(N);

  // find each sample in the cluster and merge it into the statistics
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample(&SearchState)) != nullptr) {
    for (i = 0; i < N; i++) {
      // offset of this sample from the cluster mean in dimension i
      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
      if (ParamDesc[i].Circular) {
        // wrap circular dimensions so the offset lies on the shorter arc
        if (Distance[i] > ParamDesc[i].HalfRange) {
          Distance[i] -= ParamDesc[i].Range;
        }
        if (Distance[i] < -ParamDesc[i].HalfRange) {
          Distance[i] += ParamDesc[i].Range;
        }
      }
      // track the per-dimension range of offsets (used for uniform models)
      if (Distance[i] < Statistics->Min[i]) {
        Statistics->Min[i] = Distance[i];
      }
      if (Distance[i] > Statistics->Max[i]) {
        Statistics->Max[i] = Distance[i];
      }
    }
    // accumulate the outer product Distance * Distance^T into the
    // (row-major, N x N) covariance matrix
    auto CoVariance = &Statistics->CoVariance[0];
    for (i = 0; i < N; i++) {
      for (j = 0; j < N; j++, CoVariance++) {
        *CoVariance += Distance[i] * Distance[j];
      }
    }
  }
  // normalize the variances by the total number of samples
  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
  // also compute the geometric mean of the diagonal variances
  // ensure that clusters with only 1 sample are handled correctly
  if (Cluster->SampleCount > 1) {
    SampleCountAdjustedForBias = Cluster->SampleCount - 1;
  } else {
    SampleCountAdjustedForBias = 1;
  }
  auto CoVariance = &Statistics->CoVariance[0];
  for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++, CoVariance++) {
      *CoVariance /= SampleCountAdjustedForBias;
      if (j == i) {
        // clamp diagonal variances so later divisions/sqrt stay well-defined
        if (*CoVariance < MINVARIANCE) {
          *CoVariance = MINVARIANCE;
        }
        // AvgVariance accumulates the product of the diagonal; the Nth
        // root below turns it into the geometric mean
        Statistics->AvgVariance *= *CoVariance;
      }
    }
  }
  Statistics->AvgVariance =
      static_cast<float>(pow(static_cast<double>(Statistics->AvgVariance), 1.0 / N));

  return Statistics;
} // ComputeStatistics
| 2428 | |
| 2429 /** | |
| 2430 * This routine creates a spherical prototype data structure to | |
| 2431 * approximate the samples in the specified cluster. | |
| 2432 * Spherical prototypes have a single variance which is | |
| 2433 * common across all dimensions. All dimensions are normally | |
| 2434 * distributed and independent. | |
| 2435 * @param N number of dimensions | |
| 2436 * @param Cluster cluster to be made into a spherical prototype | |
| 2437 * @param Statistics statistical info about samples in cluster | |
| 2438 * @return Pointer to a new spherical prototype data structure | |
| 2439 */ | |
| 2440 static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) { | |
| 2441 PROTOTYPE *Proto; | |
| 2442 | |
| 2443 Proto = NewSimpleProto(N, Cluster); | |
| 2444 | |
| 2445 Proto->Variance.Spherical = Statistics->AvgVariance; | |
| 2446 if (Proto->Variance.Spherical < MINVARIANCE) { | |
| 2447 Proto->Variance.Spherical = MINVARIANCE; | |
| 2448 } | |
| 2449 | |
| 2450 Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical); | |
| 2451 Proto->TotalMagnitude = static_cast<float>( | |
| 2452 pow(static_cast<double>(Proto->Magnitude.Spherical), static_cast<double>(N))); | |
| 2453 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical; | |
| 2454 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude)); | |
| 2455 | |
| 2456 return (Proto); | |
| 2457 } // NewSphericalProto | |
| 2458 | |
| 2459 /** | |
| 2460 * This routine creates an elliptical prototype data structure to | |
| 2461 * approximate the samples in the specified cluster. | |
| 2462 * Elliptical prototypes have a variance for each dimension. | |
| 2463 * All dimensions are normally distributed and independent. | |
| 2464 * @param N number of dimensions | |
| 2465 * @param Cluster cluster to be made into an elliptical prototype | |
| 2466 * @param Statistics statistical info about samples in cluster | |
| 2467 * @return Pointer to a new elliptical prototype data structure | |
| 2468 */ | |
| 2469 static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) { | |
| 2470 PROTOTYPE *Proto; | |
| 2471 int i; | |
| 2472 | |
| 2473 Proto = NewSimpleProto(N, Cluster); | |
| 2474 Proto->Variance.Elliptical = new float[N]; | |
| 2475 Proto->Magnitude.Elliptical = new float[N]; | |
| 2476 Proto->Weight.Elliptical = new float[N]; | |
| 2477 | |
| 2478 auto CoVariance = &Statistics->CoVariance[0]; | |
| 2479 Proto->TotalMagnitude = 1.0; | |
| 2480 for (i = 0; i < N; i++, CoVariance += N + 1) { | |
| 2481 Proto->Variance.Elliptical[i] = *CoVariance; | |
| 2482 if (Proto->Variance.Elliptical[i] < MINVARIANCE) { | |
| 2483 Proto->Variance.Elliptical[i] = MINVARIANCE; | |
| 2484 } | |
| 2485 | |
| 2486 Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]); | |
| 2487 Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i]; | |
| 2488 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; | |
| 2489 } | |
| 2490 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude)); | |
| 2491 Proto->Style = elliptical; | |
| 2492 return (Proto); | |
| 2493 } // NewEllipticalProto | |
| 2494 | |
| 2495 /** | |
| 2496 * This routine creates a mixed prototype data structure to | |
| 2497 * approximate the samples in the specified cluster. | |
| 2498 * Mixed prototypes can have different distributions for | |
| 2499 * each dimension. All dimensions are independent. The | |
| 2500 * structure is initially filled in as though it were an | |
| 2501 * elliptical prototype. The actual distributions of the | |
| 2502 * dimensions can be altered by other routines. | |
| 2503 * @param N number of dimensions | |
| 2504 * @param Cluster cluster to be made into a mixed prototype | |
| 2505 * @param Statistics statistical info about samples in cluster | |
| 2506 * @return Pointer to a new mixed prototype data structure | |
| 2507 */ | |
| 2508 static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) { | |
| 2509 auto Proto = NewEllipticalProto(N, Cluster, Statistics); | |
| 2510 Proto->Distrib.clear(); | |
| 2511 Proto->Distrib.resize(N, normal); | |
| 2512 Proto->Style = mixed; | |
| 2513 return Proto; | |
| 2514 } // NewMixedProto | |
| 2515 | |
| 2516 /** | |
| 2517 * This routine allocates memory to hold a simple prototype | |
| 2518 * data structure, i.e. one without independent distributions | |
| 2519 * and variances for each dimension. | |
| 2520 * @param N number of dimensions | |
| 2521 * @param Cluster cluster to be made into a prototype | |
| 2522 * @return Pointer to new simple prototype | |
| 2523 */ | |
| 2524 static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) { | |
| 2525 auto Proto = new PROTOTYPE; | |
| 2526 Proto->Mean = Cluster->Mean; | |
| 2527 Proto->Distrib.clear(); | |
| 2528 Proto->Significant = true; | |
| 2529 Proto->Merged = false; | |
| 2530 Proto->Style = spherical; | |
| 2531 Proto->NumSamples = Cluster->SampleCount; | |
| 2532 Proto->Cluster = Cluster; | |
| 2533 Proto->Cluster->Prototype = true; | |
| 2534 return Proto; | |
| 2535 } // NewSimpleProto | |
| 2536 | |
| 2537 /** | |
| 2538 * This routine returns true if the specified covariance | |
| 2539 * matrix indicates that all N dimensions are independent of | |
| 2540 * one another. One dimension is judged to be independent of | |
| 2541 * another when the magnitude of the corresponding correlation | |
| 2542 * coefficient is | |
| 2543 * less than the specified Independence factor. The | |
| 2544 * correlation coefficient is calculated as: (see Duda and | |
| 2545 * Hart, pg. 247) | |
| 2546 * coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj]) | |
| 2547 * The covariance matrix is assumed to be symmetric (which | |
| 2548 * should always be true). | |
| 2549 * @param ParamDesc descriptions of each feature space dimension | |
| 2550 * @param N number of dimensions | |
| 2551 * @param CoVariance ptr to a covariance matrix | |
| 2552 * @param Independence max off-diagonal correlation coefficient | |
| 2553 * @return true if dimensions are independent, false otherwise | |
| 2554 */ | |
| 2555 static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence) { | |
| 2556 int i, j; | |
| 2557 float *VARii; // points to ith on-diagonal element | |
| 2558 float *VARjj; // points to jth on-diagonal element | |
| 2559 float CorrelationCoeff; | |
| 2560 | |
| 2561 VARii = CoVariance; | |
| 2562 for (i = 0; i < N; i++, VARii += N + 1) { | |
| 2563 if (ParamDesc[i].NonEssential) { | |
| 2564 continue; | |
| 2565 } | |
| 2566 | |
| 2567 VARjj = VARii + N + 1; | |
| 2568 CoVariance = VARii + 1; | |
| 2569 for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) { | |
| 2570 if (ParamDesc[j].NonEssential) { | |
| 2571 continue; | |
| 2572 } | |
| 2573 | |
| 2574 if ((*VARii == 0.0) || (*VARjj == 0.0)) { | |
| 2575 CorrelationCoeff = 0.0; | |
| 2576 } else { | |
| 2577 CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj))); | |
| 2578 } | |
| 2579 if (CorrelationCoeff > Independence) { | |
| 2580 return false; | |
| 2581 } | |
| 2582 } | |
| 2583 } | |
| 2584 return true; | |
| 2585 } // Independent | |
| 2586 | |
| 2587 /** | |
| 2588 * This routine returns a histogram data structure which can | |
| 2589 * be used by other routines to place samples into histogram | |
| 2590 * buckets, and then apply a goodness of fit test to the | |
| 2591 * histogram data to determine if the samples belong to the | |
| 2592 * specified probability distribution. The routine keeps | |
| 2593 * a list of bucket data structures which have already been | |
| 2594 * created so that it minimizes the computation time needed | |
| 2595 * to create a new bucket. | |
| 2596 * @param clusterer which keeps a bucket_cache for us. | |
| 2597 * @param Distribution type of probability distribution to test for | |
| 2598 * @param SampleCount number of samples that are available | |
| 2599 * @param Confidence probability of a Type I error | |
| 2600 * @return Bucket data structure | |
| 2601 */ | |
| 2602 static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount, | |
| 2603 double Confidence) { | |
| 2604 // Get an old bucket structure with the same number of buckets. | |
| 2605 uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount); | |
| 2606 BUCKETS *Buckets = clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS]; | |
| 2607 | |
| 2608 // If a matching bucket structure is not found, make one and save it. | |
| 2609 if (Buckets == nullptr) { | |
| 2610 Buckets = MakeBuckets(Distribution, SampleCount, Confidence); | |
| 2611 clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] = Buckets; | |
| 2612 } else { | |
| 2613 // Just adjust the existing buckets. | |
| 2614 if (SampleCount != Buckets->SampleCount) { | |
| 2615 AdjustBuckets(Buckets, SampleCount); | |
| 2616 } | |
| 2617 if (Confidence != Buckets->Confidence) { | |
| 2618 Buckets->Confidence = Confidence; | |
| 2619 Buckets->ChiSquared = | |
| 2620 ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence); | |
| 2621 } | |
| 2622 InitBuckets(Buckets); | |
| 2623 } | |
| 2624 return Buckets; | |
| 2625 } // GetBuckets | |
| 2626 | |
/**
 * This routine creates a histogram data structure which can
 * be used by other routines to place samples into histogram
 * buckets, and then apply a goodness of fit test to the
 * histogram data to determine if the samples belong to the
 * specified probability distribution. The buckets are
 * allocated in such a way that the expected frequency of
 * samples in each bucket is approximately the same. In
 * order to make this possible, a mapping table is
 * computed which maps "normalized" samples into the
 * appropriate bucket.
 * @param Distribution type of probability distribution to test for
 * @param SampleCount number of samples that are available
 * @param Confidence probability of a Type I error
 * @return Pointer to new heap-allocated histogram structure (caller owns it)
 */
static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence) {
  // density function per DISTRIBUTION value (D_random shares UniformDensity)
  const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity};
  int i, j;
  double BucketProbability;
  double NextBucketBoundary;
  double Probability;
  double ProbabilityDelta;
  double LastProbDensity;
  double ProbDensity;
  uint16_t CurrentBucket;
  bool Symmetrical;

  // allocate memory needed for data structure
  auto Buckets = new BUCKETS(OptimumNumberOfBuckets(SampleCount));
  Buckets->SampleCount = SampleCount;
  Buckets->Confidence = Confidence;

  // initialize simple fields
  Buckets->Distribution = Distribution;

  // all currently defined distributions are symmetrical
  Symmetrical = true;
  Buckets->ChiSquared =
      ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);

  if (Symmetrical) {
    // allocate buckets so that all have approx. equal probability
    BucketProbability = 1.0 / static_cast<double>(Buckets->NumberOfBuckets);

    // distribution is symmetric so fill in upper half then copy;
    // start in the middle bucket. With an odd bucket count the center
    // bucket straddles the mean, so only half its probability lies in
    // the upper half of the table.
    CurrentBucket = Buckets->NumberOfBuckets / 2;
    if (Odd(Buckets->NumberOfBuckets)) {
      NextBucketBoundary = BucketProbability / 2;
    } else {
      NextBucketBoundary = BucketProbability;
    }

    // walk the upper half of the mapping table, integrating the density
    // (trapezoidal rule) and advancing to the next bucket each time the
    // accumulated probability crosses a bucket boundary
    Probability = 0.0;
    LastProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(BUCKETTABLESIZE / 2);
    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
      ProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(i + 1);
      ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0);
      Probability += ProbabilityDelta;
      if (Probability > NextBucketBoundary) {
        if (CurrentBucket < Buckets->NumberOfBuckets - 1) {
          CurrentBucket++;
        }
        NextBucketBoundary += BucketProbability;
      }
      Buckets->Bucket[i] = CurrentBucket;
      Buckets->ExpectedCount[CurrentBucket] += static_cast<float>(ProbabilityDelta * SampleCount);
      LastProbDensity = ProbDensity;
    }
    // place any leftover probability into the last bucket
    // (the upper half of a symmetric distribution holds probability 0.5)
    Buckets->ExpectedCount[CurrentBucket] += static_cast<float>((0.5 - Probability) * SampleCount);

    // copy upper half of distribution to lower half (mirror the mapping)
    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) {
      Buckets->Bucket[i] = Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
    }

    // copy upper half of expected counts to lower half
    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) {
      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
    }
  }
  return Buckets;
} // MakeBuckets
| 2711 | |
| 2712 /** | |
| 2713 * This routine computes the optimum number of histogram | |
| 2714 * buckets that should be used in a chi-squared goodness of | |
| 2715 * fit test for the specified number of samples. The optimum | |
| 2716 * number is computed based on Table 4.1 on pg. 147 of | |
| 2717 * "Measurement and Analysis of Random Data" by Bendat & Piersol. | |
| 2718 * Linear interpolation is used to interpolate between table | |
| 2719 * values. The table is intended for a 0.05 level of | |
| 2720 * significance (alpha). This routine assumes that it is | |
| 2721 * equally valid for other alpha's, which may not be true. | |
| 2722 * @param SampleCount number of samples to be tested | |
| 2723 * @return Optimum number of histogram buckets | |
| 2724 */ | |
| 2725 static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) { | |
| 2726 uint8_t Last, Next; | |
| 2727 float Slope; | |
| 2728 | |
| 2729 if (SampleCount < kCountTable[0]) { | |
| 2730 return kBucketsTable[0]; | |
| 2731 } | |
| 2732 | |
| 2733 for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) { | |
| 2734 if (SampleCount <= kCountTable[Next]) { | |
| 2735 Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) / | |
| 2736 static_cast<float>(kCountTable[Next] - kCountTable[Last]); | |
| 2737 return ( | |
| 2738 static_cast<uint16_t>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last]))); | |
| 2739 } | |
| 2740 } | |
| 2741 return kBucketsTable[Last]; | |
| 2742 } // OptimumNumberOfBuckets | |
| 2743 | |
/**
 * This routine computes the chi-squared value which will
 * leave a cumulative probability of Alpha in the right tail
 * of a chi-squared distribution with the specified number of
 * degrees of freedom. Alpha must be between 0 and 1.
 * DegreesOfFreedom must be even (odd values are rounded up).
 * The routine maintains an array of lists. Each list corresponds
 * to a different number of degrees of freedom. Each entry in
 * the list corresponds to a different alpha value and its
 * corresponding chi-squared value. Therefore, once a particular
 * chi-squared value is computed, it is stored in the list and
 * never needs to be computed again.
 * @param DegreesOfFreedom determines shape of distribution
 * @param Alpha probability of right tail
 * @return Desired chi-squared value
 */
static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha)
#define CHIACCURACY 0.01
#define MINALPHA (1e-200)
{
  // memoization cache: one list of solved (alpha, chi-squared) pairs
  // per (even) degrees-of-freedom value; persists across calls
  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];

  // limit the minimum alpha that can be used - if alpha is too small
  // it may not be possible to compute chi-squared.
  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
  if (Odd(DegreesOfFreedom)) {
    DegreesOfFreedom++;
  }

  /* find the list of chi-squared values which have already been computed
   for the specified number of degrees of freedom. Search the list for
   the desired chi-squared. */
  CHISTRUCT SearchKey(0.0, Alpha);
  auto *found = search(ChiWith[DegreesOfFreedom], &SearchKey, AlphaMatch);
  auto OldChiSquared = reinterpret_cast<CHISTRUCT *>(found ? found->first_node() : nullptr);

  if (OldChiSquared == nullptr) {
    // cache miss: solve ChiArea numerically for the chi-squared value
    // (to within CHIACCURACY) and remember the result
    OldChiSquared = new CHISTRUCT(DegreesOfFreedom, Alpha);
    OldChiSquared->ChiSquared =
        Solve(ChiArea, OldChiSquared, static_cast<double>(DegreesOfFreedom), CHIACCURACY);
    ChiWith[DegreesOfFreedom] = push(ChiWith[DegreesOfFreedom], OldChiSquared);
  } else {
    // further optimization might move OldChiSquared to front of list
  }

  return (OldChiSquared->ChiSquared);

} // ComputeChiSquared
| 2792 | |
| 2793 /** | |
| 2794 * This routine computes the probability density function | |
| 2795 * of a discrete normal distribution defined by the global | |
| 2796 * variables kNormalMean, kNormalVariance, and kNormalMagnitude. | |
| 2797 * Normal magnitude could, of course, be computed in terms of | |
| 2798 * the normal variance but it is precomputed for efficiency. | |
| 2799 * @param x number to compute the normal probability density for | |
| 2800 * @note Globals: | |
| 2801 * kNormalMean mean of a discrete normal distribution | |
| 2802 * kNormalVariance variance of a discrete normal distribution | |
| 2803 * kNormalMagnitude magnitude of a discrete normal distribution | |
| 2804 * @return The value of the normal distribution at x. | |
| 2805 */ | |
| 2806 static double NormalDensity(int32_t x) { | |
| 2807 double Distance; | |
| 2808 | |
| 2809 Distance = x - kNormalMean; | |
| 2810 return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance); | |
| 2811 } // NormalDensity | |
| 2812 | |
| 2813 /** | |
| 2814 * This routine computes the probability density function | |
| 2815 * of a uniform distribution at the specified point. The | |
| 2816 * range of the distribution is from 0 to BUCKETTABLESIZE. | |
| 2817 * @param x number to compute the uniform probability density for | |
| 2818 * @return The value of the uniform distribution at x. | |
| 2819 */ | |
| 2820 static double UniformDensity(int32_t x) { | |
| 2821 constexpr auto UniformDistributionDensity = 1.0 / BUCKETTABLESIZE; | |
| 2822 | |
| 2823 if ((x >= 0) && (x <= BUCKETTABLESIZE)) { | |
| 2824 return UniformDistributionDensity; | |
| 2825 } else { | |
| 2826 return 0.0; | |
| 2827 } | |
| 2828 } // UniformDensity | |
| 2829 | |
/**
 * Trapezoidal-rule approximation to the integral of a function
 * over one small interval of width Dx.
 * @param f1 function value at the left endpoint x1
 * @param f2 function value at the right endpoint x2
 * @param Dx interval width x2 - x1 (should always be positive)
 * @return Approximation of the integral of the function from x1 to x2.
 */
static double Integral(double f1, double f2, double Dx) {
  return 0.5 * Dx * (f1 + f2);
} // Integral
| 2841 | |
/**
 * This routine counts the number of cluster samples which
 * fall within the various histogram buckets in Buckets. Only
 * one dimension of each sample is examined. The exact meaning
 * of the Mean and StdDev parameters depends on the
 * distribution which is being analyzed (this info is in the
 * Buckets data structure). For normal distributions, Mean
 * and StdDev have the expected meanings. For uniform and
 * random distributions the Mean is the center point of the
 * range and the StdDev is 1/2 the range. A dimension with
 * zero standard deviation cannot be statistically analyzed.
 * In this case, a pseudo-analysis is used.
 * The Buckets data structure is filled in.
 * @param Buckets histogram buckets to count samples
 * @param Cluster cluster whose samples are being analyzed
 * @param Dim dimension of samples which is being analyzed
 * @param ParamDesc description of the dimension
 * @param Mean "mean" of the distribution
 * @param StdDev "standard deviation" of the distribution
 */
static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
                        float Mean, float StdDev) {
  uint16_t BucketID;
  int i;
  LIST SearchState;
  SAMPLE *Sample;

  // initialize the histogram bucket counts to 0
  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
    Buckets->Count[i] = 0;
  }

  if (StdDev == 0.0) {
    /* if the standard deviation is zero, then we can't statistically
    analyze the cluster. Use a pseudo-analysis: samples exactly on
    the mean are distributed evenly across all buckets. Samples greater
    than the mean are placed in the last bucket; samples less than the
    mean are placed in the first bucket. */

    InitSampleSearch(SearchState, Cluster);
    // i cycles 0..NumberOfBuckets-1 to spread exact-mean samples round-robin.
    i = 0;
    while ((Sample = NextSample(&SearchState)) != nullptr) {
      if (Sample->Mean[Dim] > Mean) {
        BucketID = Buckets->NumberOfBuckets - 1;
      } else if (Sample->Mean[Dim] < Mean) {
        BucketID = 0;
      } else {
        BucketID = i;
      }
      // NOTE(review): this branch indexes Count[] directly, whereas the
      // StdDev != 0 branch below maps through Buckets->Bucket[] first.
      // This asymmetry matches the original code — confirm intentional.
      Buckets->Count[BucketID] += 1;
      i++;
      if (i >= Buckets->NumberOfBuckets) {
        i = 0;
      }
    }
  } else {
    // search for all samples in the cluster and add to histogram buckets
    InitSampleSearch(SearchState, Cluster);
    while ((Sample = NextSample(&SearchState)) != nullptr) {
      // pick the bucketing scheme that matches the hypothesized distribution
      switch (Buckets->Distribution) {
        case normal:
          BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
          break;
        case D_random:
        case uniform:
          BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
          break;
        default:
          BucketID = 0;
      }
      // Bucket[] remaps raw bucket ids (presumably merging sparse buckets —
      // see the BUCKETS construction code; verify against CreateBuckets).
      Buckets->Count[Buckets->Bucket[BucketID]] += 1;
    }
  }
} // FillBuckets
| 2916 | |
| 2917 /** | |
| 2918 * This routine determines which bucket x falls into in the | |
| 2919 * discrete normal distribution defined by kNormalMean | |
| 2920 * and kNormalStdDev. x values which exceed the range of | |
| 2921 * the discrete distribution are clipped. | |
| 2922 * @param ParamDesc used to identify circular dimensions | |
| 2923 * @param x value to be normalized | |
| 2924 * @param Mean mean of normal distribution | |
| 2925 * @param StdDev standard deviation of normal distribution | |
| 2926 * @return Bucket number into which x falls | |
| 2927 */ | |
| 2928 static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) { | |
| 2929 float X; | |
| 2930 | |
| 2931 // wraparound circular parameters if necessary | |
| 2932 if (ParamDesc->Circular) { | |
| 2933 if (x - Mean > ParamDesc->HalfRange) { | |
| 2934 x -= ParamDesc->Range; | |
| 2935 } else if (x - Mean < -ParamDesc->HalfRange) { | |
| 2936 x += ParamDesc->Range; | |
| 2937 } | |
| 2938 } | |
| 2939 | |
| 2940 X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean; | |
| 2941 if (X < 0) { | |
| 2942 return 0; | |
| 2943 } | |
| 2944 if (X > BUCKETTABLESIZE - 1) { | |
| 2945 return (static_cast<uint16_t>(BUCKETTABLESIZE - 1)); | |
| 2946 } | |
| 2947 return static_cast<uint16_t>(floor(static_cast<double>(X))); | |
| 2948 } // NormalBucket | |
| 2949 | |
| 2950 /** | |
| 2951 * This routine determines which bucket x falls into in the | |
| 2952 * discrete uniform distribution defined by | |
| 2953 * BUCKETTABLESIZE. x values which exceed the range of | |
| 2954 * the discrete distribution are clipped. | |
| 2955 * @param ParamDesc used to identify circular dimensions | |
| 2956 * @param x value to be normalized | |
| 2957 * @param Mean center of range of uniform distribution | |
| 2958 * @param StdDev 1/2 the range of the uniform distribution | |
| 2959 * @return Bucket number into which x falls | |
| 2960 */ | |
| 2961 static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) { | |
| 2962 float X; | |
| 2963 | |
| 2964 // wraparound circular parameters if necessary | |
| 2965 if (ParamDesc->Circular) { | |
| 2966 if (x - Mean > ParamDesc->HalfRange) { | |
| 2967 x -= ParamDesc->Range; | |
| 2968 } else if (x - Mean < -ParamDesc->HalfRange) { | |
| 2969 x += ParamDesc->Range; | |
| 2970 } | |
| 2971 } | |
| 2972 | |
| 2973 X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0); | |
| 2974 if (X < 0) { | |
| 2975 return 0; | |
| 2976 } | |
| 2977 if (X > BUCKETTABLESIZE - 1) { | |
| 2978 return static_cast<uint16_t>(BUCKETTABLESIZE - 1); | |
| 2979 } | |
| 2980 return static_cast<uint16_t>(floor(static_cast<double>(X))); | |
| 2981 } // UniformBucket | |
| 2982 | |
| 2983 /** | |
| 2984 * This routine performs a chi-square goodness of fit test | |
| 2985 * on the histogram data in the Buckets data structure. | |
| 2986 * true is returned if the histogram matches the probability | |
| 2987 * distribution which was specified when the Buckets | |
| 2988 * structure was originally created. Otherwise false is | |
| 2989 * returned. | |
| 2990 * @param Buckets histogram data to perform chi-square test on | |
| 2991 * @return true if samples match distribution, false otherwise | |
| 2992 */ | |
| 2993 static bool DistributionOK(BUCKETS *Buckets) { | |
| 2994 float FrequencyDifference; | |
| 2995 float TotalDifference; | |
| 2996 int i; | |
| 2997 | |
| 2998 // compute how well the histogram matches the expected histogram | |
| 2999 TotalDifference = 0.0; | |
| 3000 for (i = 0; i < Buckets->NumberOfBuckets; i++) { | |
| 3001 FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i]; | |
| 3002 TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[i]; | |
| 3003 } | |
| 3004 | |
| 3005 // test to see if the difference is more than expected | |
| 3006 if (TotalDifference > Buckets->ChiSquared) { | |
| 3007 return false; | |
| 3008 } else { | |
| 3009 return true; | |
| 3010 } | |
| 3011 } // DistributionOK | |
| 3012 | |
| 3013 /** | |
| 3014 * This routine computes the degrees of freedom that should | |
| 3015 * be used in a chi-squared test with the specified number of | |
| 3016 * histogram buckets. The result is always rounded up to | |
| 3017 * the next even number so that the value of chi-squared can be | |
| 3018 * computed more easily. This will cause the value of | |
| 3019 * chi-squared to be higher than the optimum value, resulting | |
| 3020 * in the chi-square test being more lenient than optimum. | |
| 3021 * @param Distribution distribution being tested for | |
| 3022 * @param HistogramBuckets number of buckets in chi-square test | |
| 3023 * @return The number of degrees of freedom for a chi-square test | |
| 3024 */ | |
| 3025 static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets) { | |
| 3026 static uint8_t DegreeOffsets[] = {3, 3, 1}; | |
| 3027 | |
| 3028 uint16_t AdjustedNumBuckets; | |
| 3029 | |
| 3030 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)]; | |
| 3031 if (Odd(AdjustedNumBuckets)) { | |
| 3032 AdjustedNumBuckets++; | |
| 3033 } | |
| 3034 return (AdjustedNumBuckets); | |
| 3035 | |
| 3036 } // DegreesOfFreedom | |
| 3037 | |
| 3038 /** | |
| 3039 * This routine multiplies each ExpectedCount histogram entry | |
| 3040 * by NewSampleCount/OldSampleCount so that the histogram | |
| 3041 * is now adjusted to the new sample count. | |
| 3042 * @param Buckets histogram data structure to adjust | |
| 3043 * @param NewSampleCount new sample count to adjust to | |
| 3044 */ | |
| 3045 static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) { | |
| 3046 int i; | |
| 3047 double AdjustFactor; | |
| 3048 | |
| 3049 AdjustFactor = | |
| 3050 ((static_cast<double>(NewSampleCount)) / (static_cast<double>(Buckets->SampleCount))); | |
| 3051 | |
| 3052 for (i = 0; i < Buckets->NumberOfBuckets; i++) { | |
| 3053 Buckets->ExpectedCount[i] *= AdjustFactor; | |
| 3054 } | |
| 3055 | |
| 3056 Buckets->SampleCount = NewSampleCount; | |
| 3057 | |
| 3058 } // AdjustBuckets | |
| 3059 | |
| 3060 /** | |
| 3061 * This routine sets the bucket counts in the specified histogram | |
| 3062 * to zero. | |
| 3063 * @param Buckets histogram data structure to init | |
| 3064 */ | |
| 3065 static void InitBuckets(BUCKETS *Buckets) { | |
| 3066 int i; | |
| 3067 | |
| 3068 for (i = 0; i < Buckets->NumberOfBuckets; i++) { | |
| 3069 Buckets->Count[i] = 0; | |
| 3070 } | |
| 3071 | |
| 3072 } // InitBuckets | |
| 3073 | |
| 3074 /** | |
| 3075 * This routine is used to search a list of structures which | |
| 3076 * hold pre-computed chi-squared values for a chi-squared | |
| 3077 * value whose corresponding alpha field matches the alpha | |
| 3078 * field of SearchKey. | |
| 3079 * | |
| 3080 * It is called by the list search routines. | |
| 3081 * | |
| 3082 * @param arg1 chi-squared struct being tested for a match | |
| 3083 * @param arg2 chi-squared struct that is the search key | |
| 3084 * @return true if ChiStruct's Alpha matches SearchKey's Alpha | |
| 3085 */ | |
| 3086 static int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct, | |
| 3087 void *arg2) { // CHISTRUCT *SearchKey) | |
| 3088 auto *ChiStruct = static_cast<CHISTRUCT *>(arg1); | |
| 3089 auto *SearchKey = static_cast<CHISTRUCT *>(arg2); | |
| 3090 | |
| 3091 return (ChiStruct->Alpha == SearchKey->Alpha); | |
| 3092 | |
| 3093 } // AlphaMatch | |
| 3094 | |
/**
 * This routine attempts to find an x value at which Function
 * goes to zero (i.e. a root of the function). It will only
 * work correctly if a solution actually exists and there
 * are no extrema between the solution and the InitialGuess.
 * The algorithms used are extremely primitive.
 *
 * @param Function function whose zero is to be found
 * @param FunctionParams arbitrary data to pass to function
 * @param InitialGuess point to start solution search at
 * @param Accuracy maximum allowed error
 * @return Solution of function (x for which f(x) = 0).
 */
static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy)
#define INITIALDELTA 0.1
#define DELTARATIO 0.1
{
  double x;        // current solution estimate
  double f;        // function value at x
  double Slope;    // finite-difference slope estimate at x
  double Delta;    // step used for the finite-difference slope
  double NewDelta;
  double xDelta;   // Newton step f/Slope
  double LastPosX, LastNegX; // bracketing points with f >= 0 / f < 0

  x = InitialGuess;
  Delta = INITIALDELTA;
  // NOTE(review): bounds are doubles but initialized from FLT_MAX — matches
  // the original code; confirm DBL_MAX was not intended.
  LastPosX = FLT_MAX;
  LastNegX = -FLT_MAX;
  f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
  // Newton-style iteration with a forward-difference slope; stops when the
  // positive- and negative-side estimates are within Accuracy of each other.
  // NOTE(review): no iteration cap — relies on the caller's guarantee that a
  // root exists with no intervening extrema (see function comment).
  while (Abs(LastPosX - LastNegX) > Accuracy) {
    // keep track of outer bounds of current estimate
    if (f < 0) {
      LastNegX = x;
    } else {
      LastPosX = x;
    }

    // compute the approx. slope of f(x) at the current point
    Slope = ((*Function)(static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta;

    // compute the next solution guess
    xDelta = f / Slope;
    x -= xDelta;

    // reduce the delta used for computing slope to be a fraction of
    // the amount moved to get to the new guess
    NewDelta = Abs(xDelta) * DELTARATIO;
    if (NewDelta < Delta) {
      Delta = NewDelta;
    }

    // compute the value of the function at the new guess
    f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
  }
  return (x);

} // Solve
| 3153 | |
| 3154 /** | |
| 3155 * This routine computes the area under a chi density curve | |
| 3156 * from 0 to x, minus the desired area under the curve. The | |
| 3157 * number of degrees of freedom of the chi curve is specified | |
| 3158 * in the ChiParams structure. The desired area is also | |
| 3159 * specified in the ChiParams structure as Alpha (or 1 minus | |
| 3160 * the desired area). This routine is intended to be passed | |
| 3161 * to the Solve() function to find the value of chi-squared | |
| 3162 * which will yield a desired area under the right tail of | |
| 3163 * the chi density curve. The function will only work for | |
| 3164 * even degrees of freedom. The equations are based on | |
| 3165 * integrating the chi density curve in parts to obtain | |
| 3166 * a series that can be used to compute the area under the | |
| 3167 * curve. | |
| 3168 * @param ChiParams contains degrees of freedom and alpha | |
| 3169 * @param x value of chi-squared to evaluate | |
| 3170 * @return Error between actual and desired area under the chi curve. | |
| 3171 */ | |
| 3172 static double ChiArea(CHISTRUCT *ChiParams, double x) { | |
| 3173 int i, N; | |
| 3174 double SeriesTotal; | |
| 3175 double Denominator; | |
| 3176 double PowerOfx; | |
| 3177 | |
| 3178 N = ChiParams->DegreesOfFreedom / 2 - 1; | |
| 3179 SeriesTotal = 1; | |
| 3180 Denominator = 1; | |
| 3181 PowerOfx = 1; | |
| 3182 for (i = 1; i <= N; i++) { | |
| 3183 Denominator *= 2 * i; | |
| 3184 PowerOfx *= x; | |
| 3185 SeriesTotal += PowerOfx / Denominator; | |
| 3186 } | |
| 3187 return ((SeriesTotal * exp(-0.5 * x)) - ChiParams->Alpha); | |
| 3188 | |
| 3189 } // ChiArea | |
| 3190 | |
/**
 * This routine looks at all samples in the specified cluster.
 * It computes a running estimate of the percentage of the
 * characters which have more than 1 sample in the cluster.
 * When this percentage exceeds MaxIllegal, true is returned.
 * Otherwise false is returned. The CharID
 * fields must contain integers which identify the training
 * characters which were used to generate the sample. One
 * integer is used for each sample. The NumChar field in
 * the Clusterer must contain the number of characters in the
 * training set. All CharID fields must be between 0 and
 * NumChar-1. The main function of this routine is to help
 * identify clusters which need to be split further, i.e. if
 * numerous training characters have 2 or more features which are
 * contained in the same cluster, then the cluster should be
 * split.
 *
 * @param Clusterer data structure holding cluster tree
 * @param Cluster cluster containing samples to be tested
 * @param MaxIllegal max percentage of samples allowed to have
 * more than 1 feature in the cluster
 * @return true if the cluster should be split, false otherwise.
 */
static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal)
#define ILLEGAL_CHAR 2
{
  // Per-character flag: 0 = unseen, 1 = seen once, ILLEGAL_CHAR (2) = seen
  // more than once (already counted as illegal).
  // NOTE(review): static buffer reused across calls makes this routine
  // non-reentrant / not thread-safe — confirm single-threaded use.
  static std::vector<uint8_t> CharFlags;
  LIST SearchState;
  SAMPLE *Sample;
  int32_t CharID;
  int32_t NumCharInCluster;   // count of distinct characters in the cluster
  int32_t NumIllegalInCluster; // characters with >1 sample in the cluster
  float PercentIllegal;

  // initial estimate assumes that no illegal chars exist in the cluster
  NumCharInCluster = Cluster->SampleCount;
  NumIllegalInCluster = 0;

  // grow (never shrink) the flag buffer to cover every CharID
  if (Clusterer->NumChar > CharFlags.size()) {
    CharFlags.resize(Clusterer->NumChar);
  }

  // reset all flags to "unseen"
  for (auto &CharFlag : CharFlags) {
    CharFlag = false;
  }

  // find each sample in the cluster and check if we have seen it before
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample(&SearchState)) != nullptr) {
    CharID = Sample->CharID;
    if (CharFlags[CharID] == false) {
      // first sample for this character
      CharFlags[CharID] = true;
    } else {
      // repeat sample: count the character as illegal only on its second
      // occurrence (flag == 1); later repeats (flag == ILLEGAL_CHAR, i.e. 2,
      // which compares unequal to true) only decrement the distinct count.
      if (CharFlags[CharID] == true) {
        NumIllegalInCluster++;
        CharFlags[CharID] = ILLEGAL_CHAR;
      }
      NumCharInCluster--;
      PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
      if (PercentIllegal > MaxIllegal) {
        // threshold exceeded: release the search state before bailing out
        destroy(SearchState);
        return true;
      }
    }
  }
  return false;

} // MultipleCharSamples
| 3259 | |
/**
 * Compute the inverse of a matrix using LU decomposition with partial
 * pivoting. `input` and `inv` are size x size matrices stored row-major
 * in flat float arrays; `inv` is overwritten with the result.
 * The return value is the sum of norms of the off-diagonal terms of the
 * product of a and inv. (A measure of the error.)
 */
static double InvertMatrix(const float *input, int size, float *inv) {
  // Allocate memory for the 2D arrays.
  GENERIC_2D_ARRAY<double> U(size, size, 0.0);
  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
  GENERIC_2D_ARRAY<double> L(size, size, 0.0);

  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
  int row;
  int col;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      U[row][col] = input[row * size + col];
      L[row][col] = row == col ? 1.0 : 0.0;
      U_inv[row][col] = 0.0;
    }
  }

  // Compute forward matrix by inversion by LU decomposition of input.
  // Gaussian elimination with partial pivoting: U becomes upper-triangular,
  // and L accumulates the row operations (including the row swaps).
  for (col = 0; col < size; ++col) {
    // Find best pivot: the largest |U[row][col]| on or below the diagonal.
    int best_row = 0;
    double best_pivot = -1.0;
    for (row = col; row < size; ++row) {
      if (Abs(U[row][col]) > best_pivot) {
        best_pivot = Abs(U[row][col]);
        best_row = row;
      }
    }
    // Exchange pivot rows (in both U and L, keeping them consistent).
    if (best_row != col) {
      for (int k = 0; k < size; ++k) {
        double tmp = U[best_row][k];
        U[best_row][k] = U[col][k];
        U[col][k] = tmp;
        tmp = L[best_row][k];
        L[best_row][k] = L[col][k];
        L[col][k] = tmp;
      }
    }
    // Now do the pivot itself: eliminate the entries below the diagonal.
    // NOTE(review): no guard for a zero pivot (singular input) — division
    // by U[col][col] would produce inf/nan; presumably callers pass
    // non-singular matrices. Confirm with call sites.
    for (row = col + 1; row < size; ++row) {
      double ratio = -U[row][col] / U[col][col];
      for (int j = col; j < size; ++j) {
        U[row][j] += U[col][j] * ratio;
      }
      for (int k = 0; k < size; ++k) {
        L[row][k] += L[col][k] * ratio;
      }
    }
  }
  // Next invert U by back-substitution (column by column, bottom-up).
  for (col = 0; col < size; ++col) {
    U_inv[col][col] = 1.0 / U[col][col];
    for (row = col - 1; row >= 0; --row) {
      double total = 0.0;
      for (int k = col; k > row; --k) {
        total += U[row][k] * U_inv[k][col];
      }
      U_inv[row][col] = -total / U[row][row];
    }
  }
  // Now the answer is U_inv.L.
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = row; k < size; ++k) {
        sum += U_inv[row][k] * L[k][col];
      }
      inv[row * size + col] = sum;
    }
  }
  // Check matrix product: accumulate |(input . inv)[row][col]| off-diagonal,
  // which would be all zeros for an exact inverse.
  double error_sum = 0.0;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = 0; k < size; ++k) {
        sum += static_cast<double>(input[row * size + k]) * inv[k * size + col];
      }
      if (row != col) {
        error_sum += Abs(sum);
      }
    }
  }
  return error_sum;
}
| 3351 | |
| 3352 } // namespace tesseract |
