diff mupdf-source/thirdparty/tesseract/src/classify/normmatch.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/classify/normmatch.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,214 @@
+/******************************************************************************
+ ** Filename:    normmatch.c
+ ** Purpose:     Simple matcher based on character normalization features.
+ ** Author:      Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+/*----------------------------------------------------------------------------
+          Include Files and Type Defines
+----------------------------------------------------------------------------*/
+#include "normmatch.h"
+
+#include "classify.h"
+#include "clusttool.h"
+#include "helpers.h"
+#include "normfeat.h"
+#include "params.h"
+#include "unicharset.h"
+
+#include <cmath>
+#include <cstdio>
+#include <sstream> // for std::istringstream
+
+namespace tesseract {
+
+struct NORM_PROTOS {
+  NORM_PROTOS(size_t n) : NumProtos(n), Protos(n) {
+  }
+  int NumParams = 0;
+  int NumProtos;
+  PARAM_DESC *ParamDesc = nullptr;
+  std::vector<LIST> Protos;
+};
+
+/*----------------------------------------------------------------------------
+              Private Code
+----------------------------------------------------------------------------*/
+
+/**
+ * @name NormEvidenceOf
+ *
+ * Return the new type of evidence number corresponding to this
+ * normalization adjustment.  The equation that represents the transform is:
+ *       1 / (1 + (NormAdj / midpoint) ^ curl)
+ */
+static float NormEvidenceOf(float NormAdj) {
+  NormAdj /= static_cast<float>(classify_norm_adj_midpoint);
+
+  if (classify_norm_adj_curl == 3) {
+    NormAdj = NormAdj * NormAdj * NormAdj;
+  } else if (classify_norm_adj_curl == 2) {
+    NormAdj = NormAdj * NormAdj;
+  } else {
+    NormAdj = std::pow(NormAdj, static_cast<float>(classify_norm_adj_curl));
+  }
+  return (1 / (1 + NormAdj));
+}
+
+/*----------------------------------------------------------------------------
+        Variables
+----------------------------------------------------------------------------*/
+
+/** control knobs used to control the normalization adjustment process */
+double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
+double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
+/** Weight of width variance against height and vertical position. */
+const float kWidthErrorWeighting = 0.125f;
+
+/*----------------------------------------------------------------------------
+              Public Code
+----------------------------------------------------------------------------*/
+/**
+ * This routine compares Features against each character
+ * normalization proto for ClassId and returns the match
+ * rating of the best match.
+ * @param ClassId id of class to match against
+ * @param feature character normalization feature
+ * @param DebugMatch controls dump of debug info
+ *
+ * Globals:
+ * #NormProtos character normalization prototypes
+ *
+ * @return Best match rating for Feature against protos of ClassId.
+ */
+float Classify::ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) {
+  if (ClassId >= NormProtos->NumProtos) {
+    ClassId = NO_CLASS;
+  }
+
+  /* handle requests for classification as noise */
+  if (ClassId == NO_CLASS) {
+    /* kludge - clean up constants and make into control knobs later */
+    float Match = (feature.Params[CharNormLength] * feature.Params[CharNormLength] * 500.0f +
+                   feature.Params[CharNormRx] * feature.Params[CharNormRx] * 8000.0f +
+                   feature.Params[CharNormRy] * feature.Params[CharNormRy] * 8000.0f);
+    return (1 - NormEvidenceOf(Match));
+  }
+
+  if (DebugMatch) {
+    tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
+  }
+
+  LIST Protos = NormProtos->Protos[ClassId];
+  if (Protos == nullptr) {
+     // Avoid FP overflow in NormEvidenceOf.
+     return 1.0f;
+  }
+
+  float BestMatch = FLT_MAX;
+  iterate(Protos) {
+    auto Proto = reinterpret_cast<PROTOTYPE *>(Protos->first_node());
+    float Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
+    float Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
+    if (DebugMatch) {
+      tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormY], Delta,
+              Proto->Weight.Elliptical[CharNormY], Match);
+    }
+    Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
+    Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
+    if (DebugMatch) {
+      tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormRx], Delta,
+              Proto->Weight.Elliptical[CharNormRx], Match);
+    }
+    // Ry is width! See intfx.cpp.
+    Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
+    if (DebugMatch) {
+      tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", Proto->Mean[CharNormRy], Delta,
+              Proto->Weight.Elliptical[CharNormRy]);
+    }
+    Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
+    Delta *= kWidthErrorWeighting;
+    Match += Delta;
+    if (DebugMatch) {
+      tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", Match,
+              Match / classify_norm_adj_midpoint, NormEvidenceOf(Match),
+              256 * (1 - NormEvidenceOf(Match)));
+    }
+
+    if (Match < BestMatch) {
+      BestMatch = Match;
+    }
+  }
+  return 1 - NormEvidenceOf(BestMatch);
+} /* ComputeNormMatch */
+
+void Classify::FreeNormProtos() {
+  if (NormProtos != nullptr) {
+    for (int i = 0; i < NormProtos->NumProtos; i++) {
+      FreeProtoList(&NormProtos->Protos[i]);
+    }
+    delete[] NormProtos->ParamDesc;
+    delete NormProtos;
+    NormProtos = nullptr;
+  }
+}
+
+/**
+ * This routine allocates a new data structure to hold
+ * a set of character normalization protos.  It then fills in
+ * the data structure by reading from the specified File.
+ * @param fp open text file to read normalization protos from
+ * Globals: none
+ * @return Character normalization protos.
+ */
+NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {
+  char unichar[2 * UNICHAR_LEN + 1];
+  UNICHAR_ID unichar_id;
+  LIST Protos;
+  int NumProtos;
+
+  /* allocate and initialization data structure */
+  auto NormProtos = new NORM_PROTOS(unicharset.size());
+
+  /* read file header and save in data structure */
+  NormProtos->NumParams = ReadSampleSize(fp);
+  NormProtos->ParamDesc = ReadParamDesc(fp, NormProtos->NumParams);
+
+  /* read protos for each class into a separate list */
+  const int kMaxLineSize = 100;
+  char line[kMaxLineSize];
+  while (fp->FGets(line, kMaxLineSize) != nullptr) {
+    std::istringstream stream(line);
+    stream.imbue(std::locale::classic());
+    stream >> unichar >> NumProtos;
+    if (stream.fail()) {
+      continue;
+    }
+    if (unicharset.contains_unichar(unichar)) {
+      unichar_id = unicharset.unichar_to_id(unichar);
+      Protos = NormProtos->Protos[unichar_id];
+      for (int i = 0; i < NumProtos; i++) {
+        Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
+      }
+      NormProtos->Protos[unichar_id] = Protos;
+    } else {
+      tprintf("Error: unichar %s in normproto file is not in unichar set.\n", unichar);
+      for (int i = 0; i < NumProtos; i++) {
+        FreePrototype(ReadPrototype(fp, NormProtos->NumParams));
+      }
+    }
+  }
+  return NormProtos;
+} /* ReadNormProtos */
+
+} // namespace tesseract