comparison mupdf-source/thirdparty/tesseract/src/training/common/intfeaturemap.h @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 ///////////////////////////////////////////////////////////////////////
4 // File: intfeaturemap.h
5 // Description: Encapsulation of IntFeatureSpace with IndexMapBiDi
6 // to provide a subspace mapping and fast feature lookup.
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19
20 #ifndef TESSERACT_CLASSIFY_INTFEATUREMAP_H_
21 #define TESSERACT_CLASSIFY_INTFEATUREMAP_H_
22
23 #include "export.h"
24 #include "indexmapbidi.h"
25 #include "intfeaturespace.h"
26 #include "intproto.h"
27
28 namespace tesseract {
29
30 class SampleIterator;
31
// Number of offset maps kept for each sign. IntFeatureMap stores this many
// "plus" tables and this many "minus" tables, and the dir argument of its
// offset-feature helpers ranges over [-kNumOffsetMaps, kNumOffsetMaps].
// constexpr (implicitly const, hence internal linkage at namespace scope)
// so that it can be used as an array bound in every translation unit that
// includes this header.
constexpr int kNumOffsetMaps = 2;
34
35 // Class to map a feature space defined by INT_FEATURE_STRUCT to a compact
36 // down-sampled subspace of actually used features.
37 // The IntFeatureMap copes with 2 stages of transformation:
38 // The first step is down-sampling (re-quantization) and converting to a
39 // single index value from the 3-D input:
40 // INT_FEATURE_STRUCT <-> index feature (via IntFeatureSpace) and
41 // the second is a feature-space compaction to map only the feature indices
42 // that are actually used. This saves space in classifiers that are built
43 // using the mapped feature space.
44 // index (sparse) feature <-> map (compact) feature via IndexMapBiDi.
45 // Although the transformations are reversible, the inverses are lossy and do
46 // not return the exact input INT_FEATURE_STRUCT, due to the many->one nature
47 // of both transformations.
48 class TESS_COMMON_TRAINING_API IntFeatureMap {
49 public:
50 IntFeatureMap();
51 ~IntFeatureMap();
52
53 // Accessors.
54 int sparse_size() const {
55 return feature_space_.Size();
56 }
57 int compact_size() const {
58 return compact_size_;
59 }
60 const IntFeatureSpace &feature_space() const {
61 return feature_space_;
62 }
63 const IndexMapBiDi &feature_map() const {
64 return feature_map_;
65 }
66
67 // Pseudo-accessors.
68 int IndexFeature(const INT_FEATURE_STRUCT &f) const;
69 int MapFeature(const INT_FEATURE_STRUCT &f) const;
70 int MapIndexFeature(int index_feature) const;
71 INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const;
72 INT_FEATURE_STRUCT InverseMapFeature(int map_feature) const;
73 void DeleteMapFeature(int map_feature);
74 bool IsMapFeatureDeleted(int map_feature) const;
75
76 // Copies the given feature_space and uses it as the index feature map
77 // from INT_FEATURE_STRUCT.
78 void Init(const IntFeatureSpace &feature_space);
79
80 // Helper to return an offset index feature. In this context an offset
81 // feature with a dir of +/-1 is a feature of a similar direction,
82 // but shifted perpendicular to the direction of the feature. An offset
83 // feature with a dir of +/-2 is feature at the same position, but rotated
84 // by +/- one [compact] quantum. Returns the index of the generated offset
85 // feature, or -1 if it doesn't exist. Dir should be in
86 // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
87 // A dir of 0 is an identity transformation.
88 // Both input and output are from the index(sparse) feature space, not
89 // the mapped/compact feature space, but the offset feature is the minimum
90 // distance moved from the input to guarantee that it maps to the next
91 // available quantum in the mapped/compact space.
92 int OffsetFeature(int index_feature, int dir) const;
93
94 // Computes the features used by the subset of samples defined by
95 // the iterator and sets up the feature mapping.
96 // Returns the size of the compacted feature space.
97 int FindNZFeatureMapping(SampleIterator *it);
98
99 // After deleting some features, finish setting up the mapping, and map
100 // all the samples. Returns the size of the compacted feature space.
101 int FinalizeMapping(SampleIterator *it);
102
103 // Indexes the given array of features to a vector of sorted indices.
104 void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features,
105 std::vector<int> *sorted_features) const {
106 feature_space_.IndexAndSortFeatures(features, num_features, sorted_features);
107 }
108 // Maps the given array of index/sparse features to an array of map/compact
109 // features.
110 // Assumes the input is sorted. The output indices are sorted and uniqued.
111 // Returns the number of "missed" features, being features that
112 // don't map to the compact feature space.
113 int MapIndexedFeatures(const std::vector<int> &index_features,
114 std::vector<int> *map_features) const {
115 return feature_map_.MapFeatures(index_features, map_features);
116 }
117
118 // Prints the map features from the set in human-readable form.
119 void DebugMapFeatures(const std::vector<int> &map_features) const;
120
121 private:
122 void Clear();
123
124 // Helper to compute an offset index feature. In this context an offset
125 // feature with a dir of +/-1 is a feature of a similar direction,
126 // but shifted perpendicular to the direction of the feature. An offset
127 // feature with a dir of +/-2 is feature at the same position, but rotated
128 // by +/- one [compact] quantum. Returns the index of the generated offset
129 // feature, or -1 if it doesn't exist. Dir should be in
130 // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
131 // A dir of 0 is an identity transformation.
132 // Both input and output are from the index(sparse) feature space, not
133 // the mapped/compact feature space, but the offset feature is the minimum
134 // distance moved from the input to guarantee that it maps to the next
135 // available quantum in the mapped/compact space.
136 int ComputeOffsetFeature(int index_feature, int dir) const;
137
138 // True if the mapping has changed since it was last finalized.
139 bool mapping_changed_;
140 // Size of the compacted feature space, after unused features are removed.
141 int compact_size_;
142 // Feature space quantization definition and indexing from INT_FEATURE_STRUCT.
143 IntFeatureSpace feature_space_;
144 // Mapping from indexed feature space to the compacted space with unused
145 // features mapping to -1.
146 IndexMapBiDi feature_map_;
147 // Index tables to map a feature index to the corresponding feature after a
148 // shift perpendicular to the feature direction, or a rotation in place.
149 // An entry of -1 indicates that there is no corresponding feature.
150 // Array of arrays of size feature_space_.Size() owned by this class.
151 int *offset_plus_[kNumOffsetMaps];
152 int *offset_minus_[kNumOffsetMaps];
153
154 // Don't use default copy and assign!
155 IntFeatureMap(const IntFeatureMap &);
156 void operator=(const IntFeatureMap &);
157 };
158
159 } // namespace tesseract.
160
161 #endif // TESSERACT_CLASSIFY_INTFEATUREMAP_H_