Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/classify/cluster.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/classify/cluster.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,144 @@ +/****************************************************************************** + ** Filename: cluster.h + ** Purpose: Definition of feature space clustering routines + ** Author: Dan Johnson + ** + ** (c) Copyright Hewlett-Packard Company, 1988. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + *****************************************************************************/ + +#ifndef CLUSTER_H +#define CLUSTER_H + +#include "kdtree.h" +#include "oldlist.h" + +namespace tesseract { + +struct BUCKETS; + +#define MINBUCKETS 5 +#define MAXBUCKETS 39 + +/*---------------------------------------------------------------------- + Types +----------------------------------------------------------------------*/ +struct CLUSTER { + CLUSTER(size_t n) : Mean(n) { + } + + ~CLUSTER() { + delete Left; + delete Right; + } + + bool Clustered : 1; // true if included in a higher cluster + bool Prototype : 1; // true if cluster represented by a proto + unsigned SampleCount : 30; // number of samples in this cluster + CLUSTER *Left; // ptr to left sub-cluster + CLUSTER *Right; // ptr to right sub-cluster + int32_t CharID; // identifier of char sample came from + std::vector<float> Mean; // mean of cluster - SampleSize floats +}; +using SAMPLE = CLUSTER; // can refer to as either sample or cluster + +typedef enum { spherical, elliptical, mixed, automatic } PROTOSTYLE; + +struct CLUSTERCONFIG { // parameters to control clustering + PROTOSTYLE ProtoStyle; // specifies types of protos to be made + float MinSamples; // min # of samples per proto - % of total + float MaxIllegal; // max percentage of samples in a cluster which + // have more than 1 feature in that cluster + float Independence; // desired independence between dimensions + double Confidence; // desired confidence in prototypes created + int MagicSamples; // Ideal number of samples in a cluster. +}; + +typedef enum { normal, uniform, D_random, DISTRIBUTION_COUNT } DISTRIBUTION; + +union FLOATUNION { + float Spherical; + float *Elliptical; +}; + +struct PROTOTYPE { + bool Significant : 1; // true if prototype is significant + bool Merged : 1; // Merged after clustering so do not output + // but kept for display purposes. If it has no + // samples then it was actually merged. + // Otherwise it matched an already significant + // cluster. + unsigned Style : 2; // spherical, elliptical, or mixed + unsigned NumSamples : 28; // number of samples in the cluster + CLUSTER *Cluster; // ptr to cluster which made prototype + std::vector<DISTRIBUTION> Distrib; // different distribution for each dimension + std::vector<float> Mean; // prototype mean + float TotalMagnitude; // total magnitude over all dimensions + float LogMagnitude; // log base e of TotalMagnitude + FLOATUNION Variance; // prototype variance + FLOATUNION Magnitude; // magnitude of density function + FLOATUNION Weight; // weight of density function +}; + +struct CLUSTERER { + int16_t SampleSize; // number of parameters per sample + PARAM_DESC *ParamDesc; // description of each parameter + int32_t NumberOfSamples; // total number of samples being clustered + KDTREE *KDTree; // for optimal nearest neighbor searching + CLUSTER *Root; // ptr to root cluster of cluster tree + LIST ProtoList; // list of prototypes + uint32_t NumChar; // # of characters represented by samples + // cache of reusable histograms by distribution type and number of buckets. + BUCKETS *bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS + 1 - MINBUCKETS]; +}; + +struct SAMPLELIST { + int32_t NumSamples; // number of samples in list + int32_t MaxNumSamples; // maximum size of list + SAMPLE *Sample[1]; // array of ptrs to sample data structures +}; + +// low level cluster tree analysis routines. +#define InitSampleSearch(S, C) (((C) == nullptr) ? (S = NIL_LIST) : (S = push(NIL_LIST, (C)))) + +/*-------------------------------------------------------------------------- + Public Function Prototypes +--------------------------------------------------------------------------*/ +TESS_API +CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]); + +TESS_API +SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID); + +TESS_API +LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config); + +TESS_API +void FreeClusterer(CLUSTERER *Clusterer); + +TESS_API +void FreeProtoList(LIST *ProtoList); + +void FreePrototype(void *arg); // PROTOTYPE *Prototype); + +CLUSTER *NextSample(LIST *SearchState); + +float Mean(PROTOTYPE *Proto, uint16_t Dimension); + +float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension); + +TESS_API +int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], + float m1[], float m2[]); + +} // namespace tesseract + +#endif
