view mupdf-source/thirdparty/tesseract/src/training/common/intfeaturemap.h @ 17:dd9cdb856310

Remove PKG-INFO because it is regenerated automatically for the sdist
author Franz Glasner <fzglas.hg@dom66.de>
date Thu, 18 Sep 2025 17:40:40 +0200
parents b50eed0cc0ef
children
line wrap: on
line source

// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File:        intfeaturemap.h
// Description: Encapsulation of IntFeatureSpace with IndexMapBiDi
//              to provide a subspace mapping and fast feature lookup.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_CLASSIFY_INTFEATUREMAP_H_
#define TESSERACT_CLASSIFY_INTFEATUREMAP_H_

#include <vector>

#include "export.h"
#include "indexmapbidi.h"
#include "intfeaturespace.h"
#include "intproto.h"

namespace tesseract {

class SampleIterator;

// How many offset lookup tables exist per sign of direction: IntFeatureMap
// keeps this many "plus" tables and the same number of "minus" tables, and
// offset directions range over [-kNumOffsetMaps, kNumOffsetMaps].
static constexpr int kNumOffsetMaps = 2;

// Class to map a feature space defined by INT_FEATURE_STRUCT to a compact
// down-sampled subspace of actually used features.
// The IntFeatureMap copes with 2 stages of transformation:
// The first step is down-sampling (re-quantization) and converting to a
// single index value from the 3-D input:
//   INT_FEATURE_STRUCT <-> index feature (via IntFeatureSpace) and
// the second is a feature-space compaction to map only the feature indices
// that are actually used. This saves space in classifiers that are built
// using the mapped feature space.
//   index (sparse) feature <-> map (compact) feature via IndexMapBiDi.
// Although the transformations are reversible, the inverses are lossy and do
// not return the exact input INT_FEATURE_STRUCT, due to the many->one nature
// of both transformations.
class TESS_COMMON_TRAINING_API IntFeatureMap {
public:
  IntFeatureMap();
  ~IntFeatureMap();

  // Accessors.
  int sparse_size() const {
    return feature_space_.Size();
  }
  int compact_size() const {
    return compact_size_;
  }
  const IntFeatureSpace &feature_space() const {
    return feature_space_;
  }
  const IndexMapBiDi &feature_map() const {
    return feature_map_;
  }

  // Pseudo-accessors.
  int IndexFeature(const INT_FEATURE_STRUCT &f) const;
  int MapFeature(const INT_FEATURE_STRUCT &f) const;
  int MapIndexFeature(int index_feature) const;
  INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const;
  INT_FEATURE_STRUCT InverseMapFeature(int map_feature) const;
  void DeleteMapFeature(int map_feature);
  bool IsMapFeatureDeleted(int map_feature) const;

  // Copies the given feature_space and uses it as the index feature map
  // from INT_FEATURE_STRUCT.
  void Init(const IntFeatureSpace &feature_space);

  // Helper to return an offset index feature. In this context an offset
  // feature with a dir of +/-1 is a feature of a similar direction,
  // but shifted perpendicular to the direction of the feature. An offset
  // feature with a dir of +/-2 is feature at the same position, but rotated
  // by +/- one [compact] quantum. Returns the index of the generated offset
  // feature, or -1 if it doesn't exist. Dir should be in
  // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
  // A dir of 0 is an identity transformation.
  // Both input and output are from the index(sparse) feature space, not
  // the mapped/compact feature space, but the offset feature is the minimum
  // distance moved from the input to guarantee that it maps to the next
  // available quantum in the mapped/compact space.
  int OffsetFeature(int index_feature, int dir) const;

  // Computes the features used by the subset of samples defined by
  // the iterator and sets up the feature mapping.
  // Returns the size of the compacted feature space.
  int FindNZFeatureMapping(SampleIterator *it);

  // After deleting some features, finish setting up the mapping, and map
  // all the samples. Returns the size of the compacted feature space.
  int FinalizeMapping(SampleIterator *it);

  // Indexes the given array of features to a vector of sorted indices.
  void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features,
                            std::vector<int> *sorted_features) const {
    feature_space_.IndexAndSortFeatures(features, num_features, sorted_features);
  }
  // Maps the given array of index/sparse features to an array of map/compact
  // features.
  // Assumes the input is sorted. The output indices are sorted and uniqued.
  // Returns the number of "missed" features, being features that
  // don't map to the compact feature space.
  int MapIndexedFeatures(const std::vector<int> &index_features,
                         std::vector<int> *map_features) const {
    return feature_map_.MapFeatures(index_features, map_features);
  }

  // Prints the map features from the set in human-readable form.
  void DebugMapFeatures(const std::vector<int> &map_features) const;

private:
  void Clear();

  // Helper to compute an offset index feature. In this context an offset
  // feature with a dir of +/-1 is a feature of a similar direction,
  // but shifted perpendicular to the direction of the feature. An offset
  // feature with a dir of +/-2 is feature at the same position, but rotated
  // by +/- one [compact] quantum. Returns the index of the generated offset
  // feature, or -1 if it doesn't exist. Dir should be in
  // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
  // A dir of 0 is an identity transformation.
  // Both input and output are from the index(sparse) feature space, not
  // the mapped/compact feature space, but the offset feature is the minimum
  // distance moved from the input to guarantee that it maps to the next
  // available quantum in the mapped/compact space.
  int ComputeOffsetFeature(int index_feature, int dir) const;

  // True if the mapping has changed since it was last finalized.
  bool mapping_changed_;
  // Size of the compacted feature space, after unused features are removed.
  int compact_size_;
  // Feature space quantization definition and indexing from INT_FEATURE_STRUCT.
  IntFeatureSpace feature_space_;
  // Mapping from indexed feature space to the compacted space with unused
  // features mapping to -1.
  IndexMapBiDi feature_map_;
  // Index tables to map a feature index to the corresponding feature after a
  // shift perpendicular to the feature direction, or a rotation in place.
  // An entry of -1 indicates that there is no corresponding feature.
  // Array of arrays of size feature_space_.Size() owned by this class.
  int *offset_plus_[kNumOffsetMaps];
  int *offset_minus_[kNumOffsetMaps];

  // Don't use default copy and assign!
  IntFeatureMap(const IntFeatureMap &);
  void operator=(const IntFeatureMap &);
};

} // namespace tesseract.

#endif // TESSERACT_CLASSIFY_INTFEATUREMAP_H_