diff mupdf-source/thirdparty/tesseract/src/training/common/intfeaturedist.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/common/intfeaturedist.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,162 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File:        intfeaturedist.cpp
+// Description: Fast set-difference-based feature distance calculator.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "intfeaturedist.h"
+#include "intfeaturemap.h"
+
+namespace tesseract {
+
+IntFeatureDist::IntFeatureDist()
+    : size_(0)
+    , total_feature_weight_(0.0)
+    , feature_map_(nullptr)
+    , features_(nullptr)
+    , features_delta_one_(nullptr)
+    , features_delta_two_(nullptr) {}
+
+IntFeatureDist::~IntFeatureDist() {
+  Clear();
+}
+
+// Initialize the table to the given size of feature space.
+void IntFeatureDist::Init(const IntFeatureMap *feature_map) {
+  size_ = feature_map->sparse_size();
+  Clear();
+  feature_map_ = feature_map;
+  features_ = new bool[size_];
+  features_delta_one_ = new bool[size_];
+  features_delta_two_ = new bool[size_];
+  memset(features_, false, size_ * sizeof(features_[0]));
+  memset(features_delta_one_, false, size_ * sizeof(features_delta_one_[0]));
+  memset(features_delta_two_, false, size_ * sizeof(features_delta_two_[0]));
+  total_feature_weight_ = 0.0;
+}
+
+// Setup the map for the given indexed_features that have been indexed by
+// feature_map.
+void IntFeatureDist::Set(const std::vector<int> &indexed_features, int canonical_count,
+                         bool value) {
+  total_feature_weight_ = canonical_count;
+  for (int f : indexed_features) {
+    features_[f] = value;
+    for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
+      if (dir == 0) {
+        continue;
+      }
+      const int mapped_f = feature_map_->OffsetFeature(f, dir);
+      if (mapped_f >= 0) {
+        features_delta_one_[mapped_f] = value;
+        for (int dir2 = -kNumOffsetMaps; dir2 <= kNumOffsetMaps; ++dir2) {
+          if (dir2 == 0) {
+            continue;
+          }
+          const int mapped_f2 = feature_map_->OffsetFeature(mapped_f, dir2);
+          if (mapped_f2 >= 0) {
+            features_delta_two_[mapped_f2] = value;
+          }
+        }
+      }
+    }
+  }
+}
+
+// Compute the distance between the given feature vector and the last
+// Set feature vector.
+double IntFeatureDist::FeatureDistance(const std::vector<int> &features) const {
+  const int num_test_features = features.size();
+  const double denominator = total_feature_weight_ + num_test_features;
+  double misses = denominator;
+  for (int i = 0; i < num_test_features; ++i) {
+    const int index = features[i];
+    const double weight = 1.0;
+    if (features_[index]) {
+      // A perfect match.
+      misses -= 2.0 * weight;
+    } else if (features_delta_one_[index]) {
+      misses -= 1.5 * weight;
+    } else if (features_delta_two_[index]) {
+      // A near miss.
+      misses -= 1.0 * weight;
+    }
+  }
+  return misses / denominator;
+}
+
+// Compute the distance between the given feature vector and the last
+// Set feature vector.
+double IntFeatureDist::DebugFeatureDistance(const std::vector<int> &features) const {
+  const int num_test_features = features.size();
+  const double denominator = total_feature_weight_ + num_test_features;
+  double misses = denominator;
+  for (int i = 0; i < num_test_features; ++i) {
+    const int index = features[i];
+    const double weight = 1.0;
+    INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(features[i]);
+    tprintf("Testing feature weight %g:", weight);
+    f.print();
+    if (features_[index]) {
+      // A perfect match.
+      misses -= 2.0 * weight;
+      tprintf("Perfect hit\n");
+    } else if (features_delta_one_[index]) {
+      misses -= 1.5 * weight;
+      tprintf("-1 hit\n");
+    } else if (features_delta_two_[index]) {
+      // A near miss.
+      misses -= 1.0 * weight;
+      tprintf("-2 hit\n");
+    } else {
+      tprintf("Total miss\n");
+    }
+  }
+  tprintf("Features present:");
+  for (int i = 0; i < size_; ++i) {
+    if (features_[i]) {
+      INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
+      f.print();
+    }
+  }
+  tprintf("\nMinus one features:");
+  for (int i = 0; i < size_; ++i) {
+    if (features_delta_one_[i]) {
+      INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
+      f.print();
+    }
+  }
+  tprintf("\nMinus two features:");
+  for (int i = 0; i < size_; ++i) {
+    if (features_delta_two_[i]) {
+      INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
+      f.print();
+    }
+  }
+  tprintf("\n");
+  return misses / denominator;
+}
+
+// Clear all data.
+void IntFeatureDist::Clear() {
+  delete[] features_;
+  features_ = nullptr;
+  delete[] features_delta_one_;
+  features_delta_one_ = nullptr;
+  delete[] features_delta_two_;
+  features_delta_two_ = nullptr;
+}
+
+} // namespace tesseract