diff mupdf-source/thirdparty/tesseract/src/ccstruct/statistc.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/statistc.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,638 @@
+/**********************************************************************
+ * File:        statistc.cpp  (Formerly stats.c)
+ * Description: Simple statistical package for integer values.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "statistc.h"
+
+#include "errcode.h"
+#include "scrollview.h"
+#include "tprintf.h"
+
+#include "helpers.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+namespace tesseract {
+
+/**********************************************************************
+ * STATS::STATS
+ *
+ * Construct a new stats element by allocating and zeroing the memory.
+ **********************************************************************/
+STATS::STATS(int32_t min_bucket_value, int32_t max_bucket_value) {
+  if (max_bucket_value < min_bucket_value) {
+    min_bucket_value = 0;
+    max_bucket_value = 1;
+  }
+  rangemin_ = min_bucket_value; // setup
+  rangemax_ = max_bucket_value;
+  buckets_ = new int32_t[1 + rangemax_ - rangemin_];
+  clear();
+}
+
+/**********************************************************************
+ * STATS::set_range
+ *
+ * Alter the range on an existing stats element.
+ **********************************************************************/
+bool STATS::set_range(int32_t min_bucket_value, int32_t max_bucket_value) {
+  if (max_bucket_value < min_bucket_value) {
+    return false;
+  }
+  if (rangemax_ - rangemin_ != max_bucket_value - min_bucket_value) {
+    delete[] buckets_;
+    buckets_ = new int32_t[1 + max_bucket_value - min_bucket_value];
+  }
+  rangemin_ = min_bucket_value; // setup
+  rangemax_ = max_bucket_value;
+  clear(); // zero it
+  return true;
+}
+
+/**********************************************************************
+ * STATS::clear
+ *
+ * Clear out the STATS class by zeroing all the buckets.
+ **********************************************************************/
+void STATS::clear() { // clear out buckets
+  total_count_ = 0;
+  if (buckets_ != nullptr) {
+    memset(buckets_, 0, (1 + rangemax_ - rangemin_) * sizeof(buckets_[0]));
+  }
+}
+
+/**********************************************************************
+ * STATS::~STATS
+ *
+ * Destructor for a stats class.
+ **********************************************************************/
+STATS::~STATS() {
+  delete[] buckets_;
+}
+
+/**********************************************************************
+ * STATS::add
+ *
+ * Add a set of samples to (or delete from) a pile.
+ **********************************************************************/
+void STATS::add(int32_t value, int32_t count) {
+  if (buckets_ != nullptr) {
+    value = ClipToRange(value, rangemin_, rangemax_);
+    buckets_[value - rangemin_] += count;
+    total_count_ += count; // keep count of total
+  }
+}
+
+/**********************************************************************
+ * STATS::mode
+ *
+ * Find the mode of a stats class.
+ **********************************************************************/
+int32_t STATS::mode() const { // get mode of samples
+  if (buckets_ == nullptr) {
+    return rangemin_;
+  }
+  int32_t max = buckets_[0]; // max cell count
+  int32_t maxindex = 0;      // index of max
+  for (int index = rangemax_ - rangemin_; index > 0; --index) {
+    if (buckets_[index] > max) {
+      max = buckets_[index]; // find biggest
+      maxindex = index;
+    }
+  }
+  return maxindex + rangemin_; // index of biggest
+}
+
+/**********************************************************************
+ * STATS::mean
+ *
+ * Find the mean of a stats class.
+ **********************************************************************/
+double STATS::mean() const { // get mean of samples
+  if (buckets_ == nullptr || total_count_ <= 0) {
+    return static_cast<double>(rangemin_);
+  }
+  int64_t sum = 0;
+  for (int index = rangemax_ - rangemin_; index >= 0; --index) {
+    sum += static_cast<int64_t>(index) * buckets_[index];
+  }
+  return static_cast<double>(sum) / total_count_ + rangemin_;
+}
+
+/**********************************************************************
+ * STATS::sd
+ *
+ * Find the standard deviation of a stats class.
+ **********************************************************************/
+double STATS::sd() const { // standard deviation
+  if (buckets_ == nullptr || total_count_ <= 0) {
+    return 0.0;
+  }
+  int64_t sum = 0;
+  double sqsum = 0.0;
+  for (int index = rangemax_ - rangemin_; index >= 0; --index) {
+    sum += static_cast<int64_t>(index) * buckets_[index];
+    sqsum += static_cast<double>(index) * index * buckets_[index];
+  }
+  double variance = static_cast<double>(sum) / total_count_;
+  variance = sqsum / total_count_ - variance * variance;
+  if (variance > 0.0) {
+    return sqrt(variance);
+  }
+  return 0.0;
+}
+
+/**********************************************************************
+ * STATS::ile
+ *
+ * Returns the fractile value such that frac fraction (in [0,1]) of samples
+ * has a value less than the return value.
+ **********************************************************************/
+double STATS::ile(double frac) const {
+  if (buckets_ == nullptr || total_count_ == 0) {
+    return static_cast<double>(rangemin_);
+  }
+#if 0
+  // TODO(rays) The existing code doesn't seem to be doing the right thing
+  // with target a double but this substitute crashes the code that uses it.
+  // Investigate and fix properly.
+  int target = IntCastRounded(frac * total_count_);
+  target = ClipToRange(target, 1, total_count_);
+#else
+  double target = frac * total_count_;
+  target = ClipToRange(target, 1.0, static_cast<double>(total_count_));
+#endif
+  int sum = 0;
+  int index = 0;
+  for (index = 0; index <= rangemax_ - rangemin_ && sum < target; sum += buckets_[index++]) {
+    ;
+  }
+  if (index > 0) {
+    ASSERT_HOST(buckets_[index - 1] > 0);
+    return rangemin_ + index - static_cast<double>(sum - target) / buckets_[index - 1];
+  } else {
+    return static_cast<double>(rangemin_);
+  }
+}
+
+/**********************************************************************
+ * STATS::min_bucket
+ *
+ * Find REAL minimum bucket - ile(0.0) isn't necessarily correct
+ **********************************************************************/
+int32_t STATS::min_bucket() const { // Find min
+  if (buckets_ == nullptr || total_count_ == 0) {
+    return rangemin_;
+  }
+  int32_t min = 0;
+  for (min = 0; (min <= rangemax_ - rangemin_) && (buckets_[min] == 0); min++) {
+    ;
+  }
+  return rangemin_ + min;
+}
+
+/**********************************************************************
+ * STATS::max_bucket
+ *
+ * Find REAL maximum bucket - ile(1.0) isn't necessarily correct
+ **********************************************************************/
+
+int32_t STATS::max_bucket() const { // Find max
+  if (buckets_ == nullptr || total_count_ == 0) {
+    return rangemin_;
+  }
+  int32_t max;
+  for (max = rangemax_ - rangemin_; max > 0 && buckets_[max] == 0; max--) {
+    ;
+  }
+  return rangemin_ + max;
+}
+
+/**********************************************************************
+ * STATS::median
+ *
+ * Finds a more useful estimate of median than ile(0.5).
+ *
+ * Overcomes a problem with ile() - if the samples are, for example,
+ * 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
+ * between 6 and 13 = 9.5
+ **********************************************************************/
+double STATS::median() const { // get median
+  if (buckets_ == nullptr) {
+    return static_cast<double>(rangemin_);
+  }
+  double median = ile(0.5);
+  int median_pile = static_cast<int>(floor(median));
+  if ((total_count_ > 1) && (pile_count(median_pile) == 0)) {
+    int32_t min_pile;
+    int32_t max_pile;
+    /* Find preceding non zero pile */
+    for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--) {
+      ;
+    }
+    /* Find following non zero pile */
+    for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++) {
+      ;
+    }
+    median = (min_pile + max_pile) / 2.0;
+  }
+  return median;
+}
+
+/**********************************************************************
+ * STATS::local_min
+ *
+ * Return true if this point is a local min.
+ **********************************************************************/
+bool STATS::local_min(int32_t x) const {
+  if (buckets_ == nullptr) {
+    return false;
+  }
+  x = ClipToRange(x, rangemin_, rangemax_) - rangemin_;
+  if (buckets_[x] == 0) {
+    return true;
+  }
+  int32_t index; // table index
+  for (index = x - 1; index >= 0 && buckets_[index] == buckets_[x]; --index) {
+    ;
+  }
+  if (index >= 0 && buckets_[index] < buckets_[x]) {
+    return false;
+  }
+  for (index = x + 1; index <= rangemax_ - rangemin_ && buckets_[index] == buckets_[x]; ++index) {
+    ;
+  }
+  if (index <= rangemax_ - rangemin_ && buckets_[index] < buckets_[x]) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+/**********************************************************************
+ * STATS::smooth
+ *
+ * Apply a triangular smoothing filter to the stats.
+ * This makes the modes a bit more useful.
+ * The factor gives the height of the triangle, i.e. the weight of the
+ * centre.
+ **********************************************************************/
+void STATS::smooth(int32_t factor) {
+  if (buckets_ == nullptr || factor < 2) {
+    return;
+  }
+  STATS result(rangemin_, rangemax_);
+  int entrycount = 1 + rangemax_ - rangemin_;
+  for (int entry = 0; entry < entrycount; entry++) {
+    // centre weight
+    int count = buckets_[entry] * factor;
+    for (int offset = 1; offset < factor; offset++) {
+      if (entry - offset >= 0) {
+        count += buckets_[entry - offset] * (factor - offset);
+      }
+      if (entry + offset < entrycount) {
+        count += buckets_[entry + offset] * (factor - offset);
+      }
+    }
+    result.add(entry + rangemin_, count);
+  }
+  total_count_ = result.total_count_;
+  memcpy(buckets_, result.buckets_, entrycount * sizeof(buckets_[0]));
+}
+
+/**********************************************************************
+ * STATS::cluster
+ *
+ * Cluster the samples into max_cluster clusters.
+ * Each call runs one iteration. The array of clusters must be
+ * max_clusters+1 in size as cluster 0 is used to indicate which samples
+ * have been used.
+ * The return value is the current number of clusters.
+ **********************************************************************/
+
+int32_t STATS::cluster(float lower, // thresholds
+                       float upper,
+                       float multiple,       // distance threshold
+                       int32_t max_clusters, // max no to make
+                       STATS *clusters) {    // array of clusters
+  bool new_cluster;                          // added one
+  float *centres;                            // cluster centres
+  int32_t entry;                             // bucket index
+  int32_t cluster;                           // cluster index
+  int32_t best_cluster;                      // one to assign to
+  int32_t new_centre = 0;                    // residual mode
+  int32_t new_mode;                          // pile count of new_centre
+  int32_t count;                             // pile to place
+  float dist;                                // from cluster
+  float min_dist;                            // from best_cluster
+  int32_t cluster_count;                     // no of clusters
+
+  if (buckets_ == nullptr || max_clusters < 1) {
+    return 0;
+  }
+  centres = new float[max_clusters + 1];
+  for (cluster_count = 1;
+       cluster_count <= max_clusters && clusters[cluster_count].buckets_ != nullptr &&
+       clusters[cluster_count].total_count_ > 0;
+       cluster_count++) {
+    centres[cluster_count] = static_cast<float>(clusters[cluster_count].ile(0.5));
+    new_centre = clusters[cluster_count].mode();
+    for (entry = new_centre - 1; centres[cluster_count] - entry < lower && entry >= rangemin_ &&
+                                 pile_count(entry) <= pile_count(entry + 1);
+         entry--) {
+      count = pile_count(entry) - clusters[0].pile_count(entry);
+      if (count > 0) {
+        clusters[cluster_count].add(entry, count);
+        clusters[0].add(entry, count);
+      }
+    }
+    for (entry = new_centre + 1; entry - centres[cluster_count] < lower && entry <= rangemax_ &&
+                                 pile_count(entry) <= pile_count(entry - 1);
+         entry++) {
+      count = pile_count(entry) - clusters[0].pile_count(entry);
+      if (count > 0) {
+        clusters[cluster_count].add(entry, count);
+        clusters[0].add(entry, count);
+      }
+    }
+  }
+  cluster_count--;
+
+  if (cluster_count == 0) {
+    clusters[0].set_range(rangemin_, rangemax_);
+  }
+  do {
+    new_cluster = false;
+    new_mode = 0;
+    for (entry = 0; entry <= rangemax_ - rangemin_; entry++) {
+      count = buckets_[entry] - clusters[0].buckets_[entry];
+      // remaining pile
+      if (count > 0) { // any to handle
+        min_dist = static_cast<float>(INT32_MAX);
+        best_cluster = 0;
+        for (cluster = 1; cluster <= cluster_count; cluster++) {
+          dist = entry + rangemin_ - centres[cluster];
+          // find distance
+          if (dist < 0) {
+            dist = -dist;
+          }
+          if (dist < min_dist) {
+            min_dist = dist; // find least
+            best_cluster = cluster;
+          }
+        }
+        if (min_dist > upper // far enough for new
+            && (best_cluster == 0 || entry + rangemin_ > centres[best_cluster] * multiple ||
+                entry + rangemin_ < centres[best_cluster] / multiple)) {
+          if (count > new_mode) {
+            new_mode = count;
+            new_centre = entry + rangemin_;
+          }
+        }
+      }
+    }
+    // need new and room
+    if (new_mode > 0 && cluster_count < max_clusters) {
+      cluster_count++;
+      new_cluster = true;
+      if (!clusters[cluster_count].set_range(rangemin_, rangemax_)) {
+        delete[] centres;
+        return 0;
+      }
+      centres[cluster_count] = static_cast<float>(new_centre);
+      clusters[cluster_count].add(new_centre, new_mode);
+      clusters[0].add(new_centre, new_mode);
+      for (entry = new_centre - 1; centres[cluster_count] - entry < lower && entry >= rangemin_ &&
+                                   pile_count(entry) <= pile_count(entry + 1);
+           entry--) {
+        count = pile_count(entry) - clusters[0].pile_count(entry);
+        if (count > 0) {
+          clusters[cluster_count].add(entry, count);
+          clusters[0].add(entry, count);
+        }
+      }
+      for (entry = new_centre + 1; entry - centres[cluster_count] < lower && entry <= rangemax_ &&
+                                   pile_count(entry) <= pile_count(entry - 1);
+           entry++) {
+        count = pile_count(entry) - clusters[0].pile_count(entry);
+        if (count > 0) {
+          clusters[cluster_count].add(entry, count);
+          clusters[0].add(entry, count);
+        }
+      }
+      centres[cluster_count] = static_cast<float>(clusters[cluster_count].ile(0.5));
+    }
+  } while (new_cluster && cluster_count < max_clusters);
+  delete[] centres;
+  return cluster_count;
+}
+
+// Helper tests that the current index is still part of the peak and gathers
+// the data into the peak, returning false when the peak is ended.
+// src_buckets[index] - used_buckets[index] is the unused part of the histogram.
+// prev_count is the histogram count of the previous index on entry and is
+// updated to the current index on return.
+// total_count and total_value are accumulating the mean of the peak.
+static bool GatherPeak(int index, const int *src_buckets, int *used_buckets, int *prev_count,
+                       int *total_count, double *total_value) {
+  int pile_count = src_buckets[index] - used_buckets[index];
+  if (pile_count <= *prev_count && pile_count > 0) {
+    // Accumulate count and index.count product.
+    *total_count += pile_count;
+    *total_value += index * pile_count;
+    // Mark this index as used
+    used_buckets[index] = src_buckets[index];
+    *prev_count = pile_count;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Finds (at most) the top max_modes modes, well actually the whole peak around
+// each mode, returning them in the given modes vector as a <mean of peak,
+// total count of peak> pair in order of decreasing total count.
+// Since the mean is the key and the count the data in the pair, a single call
+// to sort on the output will re-sort by increasing mean of peak if that is
+// more useful than decreasing total count.
+// Returns the actual number of modes found.
+int STATS::top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const {
+  if (max_modes <= 0) {
+    return 0;
+  }
+  int src_count = 1 + rangemax_ - rangemin_;
+  // Used copies the counts in buckets_ as they get used.
+  STATS used(rangemin_, rangemax_);
+  modes.clear();
+  // Total count of the smallest peak found so far.
+  int least_count = 1;
+  // Mode that is used as a seed for each peak
+  int max_count = 0;
+  do {
+    // Find an unused mode.
+    max_count = 0;
+    int max_index = 0;
+    for (int src_index = 0; src_index < src_count; src_index++) {
+      int pile_count = buckets_[src_index] - used.buckets_[src_index];
+      if (pile_count > max_count) {
+        max_count = pile_count;
+        max_index = src_index;
+      }
+    }
+    if (max_count > 0) {
+      // Copy the bucket count to used so it doesn't get found again.
+      used.buckets_[max_index] = max_count;
+      // Get the entire peak.
+      double total_value = max_index * max_count;
+      int total_count = max_count;
+      int prev_pile = max_count;
+      for (int offset = 1; max_index + offset < src_count; ++offset) {
+        if (!GatherPeak(max_index + offset, buckets_, used.buckets_, &prev_pile, &total_count,
+                        &total_value)) {
+          break;
+        }
+      }
+      prev_pile = buckets_[max_index];
+      for (int offset = 1; max_index - offset >= 0; ++offset) {
+        if (!GatherPeak(max_index - offset, buckets_, used.buckets_, &prev_pile, &total_count,
+                        &total_value)) {
+          break;
+        }
+      }
+      if (total_count > least_count || modes.size() < static_cast<size_t>(max_modes)) {
+        // We definitely want this mode, so if we have enough discard the least.
+        if (modes.size() == static_cast<size_t>(max_modes)) {
+          modes.resize(max_modes - 1);
+        }
+        size_t target_index = 0;
+        // Linear search for the target insertion point.
+        while (target_index < modes.size() && modes[target_index].data() >= total_count) {
+          ++target_index;
+        }
+        auto peak_mean = static_cast<float>(total_value / total_count + rangemin_);
+        modes.insert(modes.begin() + target_index, KDPairInc<float, int>(peak_mean, total_count));
+        least_count = modes.back().data();
+      }
+    }
+  } while (max_count > 0);
+  return modes.size();
+}
+
+/**********************************************************************
+ * STATS::print
+ *
+ * Prints a summary and table of the histogram.
+ **********************************************************************/
+void STATS::print() const {
+  if (buckets_ == nullptr) {
+    return;
+  }
+  int32_t min = min_bucket() - rangemin_;
+  int32_t max = max_bucket() - rangemin_;
+
+  int num_printed = 0;
+  for (int index = min; index <= max; index++) {
+    if (buckets_[index] != 0) {
+      tprintf("%4d:%-3d ", rangemin_ + index, buckets_[index]);
+      if (++num_printed % 8 == 0) {
+        tprintf("\n");
+      }
+    }
+  }
+  tprintf("\n");
+  print_summary();
+}
+
+/**********************************************************************
+ * STATS::print_summary
+ *
+ * Print a summary of the stats.
+ **********************************************************************/
+void STATS::print_summary() const {
+  if (buckets_ == nullptr) {
+    return;
+  }
+  int32_t min = min_bucket();
+  int32_t max = max_bucket();
+  tprintf("Total count=%d\n", total_count_);
+  tprintf("Min=%.2f Really=%d\n", ile(0.0), min);
+  tprintf("Lower quartile=%.2f\n", ile(0.25));
+  tprintf("Median=%.2f, ile(0.5)=%.2f\n", median(), ile(0.5));
+  tprintf("Upper quartile=%.2f\n", ile(0.75));
+  tprintf("Max=%.2f Really=%d\n", ile(1.0), max);
+  tprintf("Range=%d\n", max + 1 - min);
+  tprintf("Mean= %.2f\n", mean());
+  tprintf("SD= %.2f\n", sd());
+}
+
+/**********************************************************************
+ * STATS::plot
+ *
+ * Draw a histogram of the stats table.
+ **********************************************************************/
+
+#ifndef GRAPHICS_DISABLED
+void STATS::plot(ScrollView *window, // to draw in
+                 float xorigin,      // bottom left
+                 float yorigin,
+                 float xscale,                     // one x unit
+                 float yscale,                     // one y unit
+                 ScrollView::Color colour) const { // colour to draw in
+  if (buckets_ == nullptr) {
+    return;
+  }
+  window->Pen(colour);
+
+  for (int index = 0; index <= rangemax_ - rangemin_; index++) {
+    window->Rectangle(xorigin + xscale * index, yorigin, xorigin + xscale * (index + 1),
+                      yorigin + yscale * buckets_[index]);
+  }
+}
+#endif
+
+/**********************************************************************
+ * STATS::plotline
+ *
+ * Draw a histogram of the stats table. (Line only)
+ **********************************************************************/
+
+#ifndef GRAPHICS_DISABLED
+void STATS::plotline(ScrollView *window, // to draw in
+                     float xorigin,      // bottom left
+                     float yorigin,
+                     float xscale,                     // one x unit
+                     float yscale,                     // one y unit
+                     ScrollView::Color colour) const { // colour to draw in
+  if (buckets_ == nullptr) {
+    return;
+  }
+  window->Pen(colour);
+  window->SetCursor(xorigin, yorigin + yscale * buckets_[0]);
+  for (int index = 0; index <= rangemax_ - rangemin_; index++) {
+    window->DrawTo(xorigin + xscale * index, yorigin + yscale * buckets_[index]);
+  }
+}
+#endif
+
+} // namespace tesseract