diff mupdf-source/thirdparty/tesseract/src/classify/clusttool.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/classify/clusttool.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,312 @@
+/******************************************************************************
+ ** Filename: clusttool.cpp
+ ** Purpose:  Misc. tools for use with the clustering routines
+ ** Author:   Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *****************************************************************************/
+
+#define _USE_MATH_DEFINES // for M_PI
+
+#include "clusttool.h"
+
+#include <cmath>   // for M_PI, std::isnan
+#include <locale>  // for std::locale::classic
+#include <sstream> // for std::stringstream
+
+namespace tesseract {
+
+//---------------Global Data Definitions and Declarations--------------------
+#define TOKENSIZE 80 ///< max size of tokens read from an input file
+/// sscanf field width used when reading a token into a char[TOKENSIZE]
+/// buffer: TOKENSIZE - 1, which leaves room for the terminating NUL.
+/// Must be updated by hand whenever TOKENSIZE changes.
+#define QUOTED_TOKENSIZE "79"
+#define MAXSAMPLESIZE 65535 ///< max num of dimensions in feature space
+
+/**
+ * This routine reads one line from the specified text file and
+ * parses N whitespace-separated floats from it into Buffer.
+ * Parsing always uses the classic "C" locale, independent of the
+ * global locale.  Buffer is never allocated here; entries before
+ * the first bad value may already have been overwritten when the
+ * routine fails.
+ * @param fp open text file to read floats from
+ * @param N number of floats to read
+ * @param Buffer caller-supplied buffer with room for at least N floats
+ * @return true if all N floats were read; false if EOF was hit
+ * before a line could be read, or if any of the N values is
+ * missing, malformed or NaN
+ * @note Globals: None
+ */
+static bool ReadNFloats(TFile *fp, uint16_t N, float Buffer[]) {
+  const int kMaxLineSize = 1024;
+  char line[kMaxLineSize];
+  if (fp->FGets(line, kMaxLineSize) == nullptr) {
+    tprintf("Hit EOF in ReadNFloats!\n");
+    return false;
+  }
+
+  std::stringstream stream(line);
+  // Use "C" locale (needed for float values Buffer[i]).
+  stream.imbue(std::locale::classic());
+  for (uint16_t i = 0; i < N; i++) {
+    float f = NAN;
+    // The NaN sentinel only survives when nothing is extracted at all (end
+    // of line).  Since C++11 a failed conversion stores 0 in f, so the
+    // stream state has to be checked too or a malformed token reads as 0.0.
+    if (!(stream >> f) || std::isnan(f)) {
+      tprintf("Read of %u floats failed!\n", N);
+      return false;
+    }
+    Buffer[i] = f;
+  }
+  return true;
+}
+
+/**
+ * Emits the first N entries of Array to File as text on a single
+ * line.  Every value is preceded by one space and printed with the
+ * "%9.6f" format; the line is terminated by a newline, which is
+ * written even when N is 0.
+ * @param File open text file to write N floats to
+ * @param N number of floats to write
+ * @param Array array of floats to write
+ */
+static void WriteNFloats(FILE *File, uint16_t N, float Array[]) {
+  // Walk the array with a pointer instead of an index.
+  for (const float *value = Array, *end = Array + N; value != end; ++value) {
+    fprintf(File, " %9.6f", *value);
+  }
+  fprintf(File, "\n");
+}
+
+/**
+ * Writes the word that names ProtoStyle ("spherical", "elliptical",
+ * "mixed" or "automatic") to the specified text file.  Nothing is
+ * written before or after the word -- in particular no separator
+ * and no line terminator -- and a value that matches none of the
+ * four styles produces no output at all.
+ * @param File open text file to write prototype style to
+ * @param ProtoStyle prototype style to write
+ */
+static void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) {
+  // Pick the word first, then emit it with a single print.
+  const char *word = nullptr;
+  if (ProtoStyle == spherical) {
+    word = "spherical";
+  } else if (ProtoStyle == elliptical) {
+    word = "elliptical";
+  } else if (ProtoStyle == mixed) {
+    word = "mixed";
+  } else if (ProtoStyle == automatic) {
+    word = "automatic";
+  }
+  if (word != nullptr) {
+    fprintf(File, "%s", word);
+  }
+}
+
+/**
+ * This routine reads one line from the specified file, parses a
+ * single decimal integer from the start of it and checks that the
+ * value lies between 0 and MAXSAMPLESIZE (both inclusive).
+ * Nothing is reported to the caller on bad input: a missing line,
+ * an unparsable number and an out-of-range value are each handed
+ * to ASSERT_HOST.
+ * @param fp open text file to read sample size from
+ * @return Sample size
+ * @note Globals: None
+ */
+uint16_t ReadSampleSize(TFile *fp) {
+  int SampleSize = 0;
+
+  const int kMaxLineSize = 100;
+  char line[kMaxLineSize];
+  // NOTE(review): the read and the parse are performed inside the
+  // ASSERT_HOST arguments, so this code relies on ASSERT_HOST always
+  // evaluating its expression -- confirm against the macro definition.
+  ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
+  ASSERT_HOST(sscanf(line, "%d", &SampleSize) == 1);
+  // With MAXSAMPLESIZE at 65535 this range check guarantees that the
+  // implicit int -> uint16_t conversion in the return is lossless.
+  ASSERT_HOST(SampleSize >= 0 && SampleSize <= MAXSAMPLESIZE);
+  return SampleSize;
+}
+
+/**
+ * Reads N lines from the file, each describing one feature
+ * dimension as two words followed by two numbers:
+ *   - a shape word; a leading 'c' marks the dimension as circular,
+ *   - an essential word; anything not starting with 'e' marks the
+ *     dimension as non-essential,
+ *   - the minimum and the maximum of the dimension.
+ * Range, HalfRange and MidRange are derived from the minimum and
+ * maximum.  A missing line or a line that cannot be parsed is
+ * handed to ASSERT_HOST.
+ *
+ * @param fp open text file to read N parameter descriptions from
+ * @param N number of parameter descriptions to read
+ * @return Pointer to an array of N parameter descriptors allocated
+ * with new[].
+ * @note Globals: None
+ */
+PARAM_DESC *ReadParamDesc(TFile *fp, uint16_t N) {
+  PARAM_DESC *const descriptors = new PARAM_DESC[N];
+  const int kMaxLineSize = TOKENSIZE * 4;
+  for (int index = 0; index < N; ++index) {
+    char line[kMaxLineSize];
+    ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
+
+    std::istringstream stream(line);
+    // Use "C" locale (needed for float values Min, Max).
+    stream.imbue(std::locale::classic());
+
+    // Pull all four fields off the line in one chained extraction.
+    PARAM_DESC &desc = descriptors[index];
+    std::string shape_word;
+    std::string essential_word;
+    stream >> shape_word >> essential_word >> desc.Min >> desc.Max;
+    ASSERT_HOST(!stream.fail());
+
+    // Only the first character of each word is significant.
+    desc.Circular = (shape_word[0] == 'c');
+    desc.NonEssential = (essential_word[0] != 'e');
+
+    // Derived values.
+    desc.Range = desc.Max - desc.Min;
+    desc.HalfRange = desc.Range / 2;
+    desc.MidRange = (desc.Max + desc.Min) / 2;
+  }
+  return descriptors;
+}
+
+/**
+ * This routine reads a textual description of a prototype from
+ * the specified file.  The first line holds a significance word, a
+ * style word and the sample count.  It is followed by one line with
+ * N mean values and one line with the variance: a single value for
+ * a spherical prototype, N values for an elliptical one.  Only the
+ * first character of each word is examined ('s' means significant;
+ * 's', 'e', 'a' select spherical, elliptical, automatic).  Any other
+ * style word is reported and then treated as elliptical.
+ *
+ * @param fp open text file to read prototype from
+ * @param N number of dimensions used in prototype
+ * @return Newly allocated prototype, or nullptr if the first line
+ * is missing or malformed, if the mean or variance values cannot
+ * be read, or if the style is neither spherical nor elliptical
+ * (i.e. automatic).  A negative sample count is handed to
+ * ASSERT_HOST.
+ * @note Globals: None
+ */
+PROTOTYPE *ReadPrototype(TFile *fp, uint16_t N) {
+  char sig_token[TOKENSIZE], shape_token[TOKENSIZE];
+  int SampleCount;
+
+  const int kMaxLineSize = TOKENSIZE * 4;
+  // Start from an empty string so that the error message below never
+  // prints indeterminate bytes should FGets fail without touching the buffer.
+  char line[kMaxLineSize] = "";
+  if (fp->FGets(line, kMaxLineSize) == nullptr ||
+      sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %d", sig_token, shape_token,
+             &SampleCount) != 3) {
+    tprintf("Invalid prototype: %s\n", line);
+    return nullptr;
+  }
+  auto Proto = new PROTOTYPE;
+  Proto->Cluster = nullptr;
+  Proto->Significant = (sig_token[0] == 's');
+
+  switch (shape_token[0]) {
+    case 's':
+      Proto->Style = spherical;
+      break;
+    case 'e':
+      Proto->Style = elliptical;
+      break;
+    case 'a':
+      Proto->Style = automatic;
+      break;
+    default:
+      tprintf("Invalid prototype style specification:%s\n", shape_token);
+      Proto->Style = elliptical;
+  }
+
+  ASSERT_HOST(SampleCount >= 0);
+  Proto->NumSamples = SampleCount;
+
+  // From here on a failed read (already reported by ReadNFloats) must not
+  // be ignored: the magnitudes and weights below would otherwise be
+  // computed from indeterminate variance values.  The prototype is
+  // discarded and nullptr returned, like for any other invalid prototype.
+  // NOTE(review): Proto is released with a plain delete, exactly as the
+  // existing "Invalid prototype style" path does.
+  Proto->Mean.resize(N);
+  if (!ReadNFloats(fp, N, &Proto->Mean[0])) {
+    delete Proto;
+    return nullptr;
+  }
+
+  switch (Proto->Style) {
+    case spherical:
+      if (!ReadNFloats(fp, 1, &(Proto->Variance.Spherical))) {
+        delete Proto;
+        return nullptr;
+      }
+      Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
+      Proto->TotalMagnitude = std::pow(Proto->Magnitude.Spherical, static_cast<float>(N));
+      Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
+      Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
+      Proto->Distrib.clear();
+      break;
+    case elliptical: {
+      // Read into a local array first so that the prototype never points
+      // at a buffer that has already been released.
+      float *variance = new float[N];
+      if (!ReadNFloats(fp, N, variance)) {
+        delete[] variance;
+        delete Proto;
+        return nullptr;
+      }
+      Proto->Variance.Elliptical = variance;
+      Proto->Magnitude.Elliptical = new float[N];
+      Proto->Weight.Elliptical = new float[N];
+      Proto->TotalMagnitude = 1.0;
+      for (int i = 0; i < N; i++) {
+        Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);
+        Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];
+        Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
+      }
+      Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
+      Proto->Distrib.clear();
+      break;
+    }
+    default:
+      delete Proto;
+      tprintf("Invalid prototype style\n");
+      return nullptr;
+  }
+  return Proto;
+}
+
+/**
+ * Writes N dimension descriptors to the specified text file, one
+ * per line.  Each line consists of the shape keyword ("circular"
+ * or "linear"), the keyword "non-essential" or "essential", and
+ * the minimum and maximum printed with "%10.6f".  Both keywords
+ * are padded with trailing blanks to a fixed width.
+ * @param File open text file to write param descriptors to
+ * @param N number of param descriptors to write
+ * @param ParamDesc array of param descriptors to write
+ */
+void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]) {
+  for (int index = 0; index < N; ++index) {
+    const PARAM_DESC &desc = ParamDesc[index];
+    fprintf(File, "%s", desc.Circular ? "circular " : "linear   ");
+    fprintf(File, "%s", desc.NonEssential ? "non-essential " : "essential     ");
+    fprintf(File, "%10.6f %10.6f\n", desc.Min, desc.Max);
+  }
+}
+
+/**
+ * Writes a textual description of a prototype to the specified
+ * text file: a header line (significance keyword, style word and
+ * sample count), a tab-indented line with the N mean values, and a
+ * tab followed by the style-dependent remainder:
+ *   - spherical: a line with the single variance,
+ *   - elliptical: a line with the N variances,
+ *   - mixed: a line naming the distribution of every dimension,
+ *     then a tab-indented line with the N variances,
+ *   - any other style: nothing more is written.
+ * @param File open text file to write prototype to
+ * @param N number of dimensions in feature space
+ * @param Proto prototype to write out
+ */
+void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto) {
+  // Header line, then the means on a line of their own.
+  fprintf(File, "%s", Proto->Significant ? "significant   " : "insignificant ");
+  WriteProtoStyle(File, static_cast<PROTOSTYLE>(Proto->Style));
+  fprintf(File, "%6u\n\t", Proto->NumSamples);
+  WriteNFloats(File, N, &Proto->Mean[0]);
+  fprintf(File, "\t");
+
+  if (Proto->Style == spherical) {
+    WriteNFloats(File, 1, &(Proto->Variance.Spherical));
+  } else if (Proto->Style == elliptical) {
+    WriteNFloats(File, N, Proto->Variance.Elliptical);
+  } else if (Proto->Style == mixed) {
+    for (int dim = 0; dim < N; ++dim) {
+      // Look up the name of this dimension's distribution; it stays
+      // null (and nothing is printed) when no name applies.
+      const char *label = nullptr;
+      switch (Proto->Distrib[dim]) {
+        case normal:
+          label = "normal";
+          break;
+        case uniform:
+          label = "uniform";
+          break;
+        case D_random:
+          label = "random";
+          break;
+        case DISTRIBUTION_COUNT:
+          ASSERT_HOST(!"Distribution count not allowed!");
+      }
+      if (label != nullptr) {
+        fprintf(File, " %9s", label);
+      }
+    }
+    fprintf(File, "\n\t");
+    WriteNFloats(File, N, Proto->Variance.Elliptical);
+  }
+}
+
+} // namespace tesseract