diff mupdf-source/thirdparty/tesseract/src/training/mergenf.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/mergenf.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,322 @@
+/******************************************************************************
+**  Filename:    MergeNF.c
+**  Purpose:     Program for merging similar nano-feature protos
+**  Author:      Dan Johnson
+**
+** (c) Copyright Hewlett-Packard Company, 1988.
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+** http://www.apache.org/licenses/LICENSE-2.0
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+******************************************************************************/
+
+#define _USE_MATH_DEFINES // for M_PI
+#include <algorithm>
+#include <cfloat> // for FLT_MAX
+#include <cmath>  // for M_PI
+#include <cstdio>
+#include <cstring>
+
+#include "cluster.h"
+#include "clusttool.h"
+#include "featdefs.h"
+#include "intproto.h"
+#include "mergenf.h"
+#include "ocrfeatures.h"
+#include "oldlist.h"
+#include "params.h"
+#include "protos.h"
+
+using namespace tesseract;
+
+/*-------------------once in subfeat---------------------------------*/
+static double_VAR(training_angle_match_scale, 1.0, "Angle Match Scale ...");
+
+static double_VAR(training_similarity_midpoint, 0.0075, "Similarity Midpoint ...");
+
+static double_VAR(training_similarity_curl, 2.0, "Similarity Curl ...");
+
+/*-----------------------------once in
+ * fasttrain----------------------------------*/
+static double_VAR(training_tangent_bbox_pad, 0.5, "Tangent bounding box pad ...");
+
+static double_VAR(training_orthogonal_bbox_pad, 2.5, "Orthogonal bounding box pad ...");
+
+static double_VAR(training_angle_pad, 45.0, "Angle pad ...");
+
+/**
+ * Compare protos p1 and p2 and return an estimate of the
+ * worst evidence rating that will result for any part of p1
+ * that is compared to p2.  In other words, if p1 were broken
+ * into pico-features and each pico-feature was matched to p2,
+ * what is the worst evidence rating that will be achieved for
+ * any pico-feature.
+ *
+ * @param p1, p2    protos to be compared
+ *
+ * Globals: none
+ *
+ * @return Worst possible result when matching p1 to p2.
+ */
+float CompareProtos(PROTO_STRUCT *p1, PROTO_STRUCT *p2) {
+  float WorstEvidence = WORST_EVIDENCE;
+  float Evidence;
+  float Angle, Length;
+
+  /* if p1 and p2 are not close in length, don't let them match */
+  Length = std::fabs(p1->Length - p2->Length);
+  if (Length > MAX_LENGTH_MISMATCH) {
+    return (0.0);
+  }
+
+  /* create a dummy pico-feature to be used for comparisons */
+  auto Feature = new FEATURE_STRUCT(&PicoFeatDesc);
+  Feature->Params[PicoFeatDir] = p1->Angle;
+
+  /* convert angle to radians */
+  Angle = p1->Angle * 2.0 * M_PI;
+
+  /* find distance from center of p1 to 1/2 picofeat from end */
+  Length = p1->Length / 2.0 - GetPicoFeatureLength() / 2.0;
+  if (Length < 0) {
+    Length = 0;
+  }
+
+  /* set the dummy pico-feature at one end of p1 and match it to p2 */
+  Feature->Params[PicoFeatX] = p1->X + std::cos(Angle) * Length;
+  Feature->Params[PicoFeatY] = p1->Y + std::sin(Angle) * Length;
+  if (DummyFastMatch(Feature, p2)) {
+    Evidence = SubfeatureEvidence(Feature, p2);
+    if (Evidence < WorstEvidence) {
+      WorstEvidence = Evidence;
+    }
+  } else {
+    delete Feature;
+    return 0.0;
+  }
+
+  /* set the dummy pico-feature at the other end of p1 and match it to p2 */
+  Feature->Params[PicoFeatX] = p1->X - std::cos(Angle) * Length;
+  Feature->Params[PicoFeatY] = p1->Y - std::sin(Angle) * Length;
+  if (DummyFastMatch(Feature, p2)) {
+    Evidence = SubfeatureEvidence(Feature, p2);
+    if (Evidence < WorstEvidence) {
+      WorstEvidence = Evidence;
+    }
+  } else {
+    delete Feature;
+    return 0.0;
+  }
+
+  delete Feature;
+  return (WorstEvidence);
+
+} /* CompareProtos */
+
+/**
+ * This routine computes a proto which is the weighted
+ * average of protos p1 and p2.  The new proto is returned
+ * in MergedProto.
+ *
+ * @param p1, p2    protos to be merged
+ * @param w1, w2    weight of each proto
+ * @param MergedProto place to put resulting merged proto
+ */
+void ComputeMergedProto(PROTO_STRUCT *p1, PROTO_STRUCT *p2, float w1, float w2, PROTO_STRUCT *MergedProto) {
+  float TotalWeight;
+
+  TotalWeight = w1 + w2;
+  w1 /= TotalWeight;
+  w2 /= TotalWeight;
+
+  MergedProto->X = p1->X * w1 + p2->X * w2;
+  MergedProto->Y = p1->Y * w1 + p2->Y * w2;
+  MergedProto->Length = p1->Length * w1 + p2->Length * w2;
+  MergedProto->Angle = p1->Angle * w1 + p2->Angle * w2;
+  FillABC(MergedProto);
+} /* ComputeMergedProto */
+
+/**
+ * This routine searches through all of the prototypes in
+ * Class and returns the id of the proto which would provide
+ * the best approximation of Prototype.  If no close
+ * approximation can be found, NO_PROTO is returned.
+ *
+ * @param Class   class to search for matching old proto in
+ * @param NumMerged # of protos merged into each proto of Class
+ * @param  Prototype new proto to find match for
+ *
+ * Globals: none
+ *
+ * @return Id of closest proto in Class or NO_PROTO.
+ */
+int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[], PROTOTYPE *Prototype) {
+  PROTO_STRUCT NewProto;
+  PROTO_STRUCT MergedProto;
+  int Pid;
+  PROTO_STRUCT *Proto;
+  int BestProto;
+  float BestMatch;
+  float Match, OldMatch, NewMatch;
+
+  MakeNewFromOld(&NewProto, Prototype);
+
+  BestProto = NO_PROTO;
+  BestMatch = WORST_MATCH_ALLOWED;
+  for (Pid = 0; Pid < Class->NumProtos; Pid++) {
+    Proto = ProtoIn(Class, Pid);
+    ComputeMergedProto(Proto, &NewProto, static_cast<float>(NumMerged[Pid]), 1.0, &MergedProto);
+    OldMatch = CompareProtos(Proto, &MergedProto);
+    NewMatch = CompareProtos(&NewProto, &MergedProto);
+    Match = std::min(OldMatch, NewMatch);
+    if (Match > BestMatch) {
+      BestProto = Pid;
+      BestMatch = Match;
+    }
+  }
+  return BestProto;
+} /* FindClosestExistingProto */
+
+/**
+ * This fills in the fields of the New proto based on the
+ * fields of the Old proto.
+ *
+ * @param New new proto to be filled in
+ * @param Old old proto to be converted
+ *
+ *  Globals: none
+ */
+void MakeNewFromOld(PROTO_STRUCT *New, PROTOTYPE *Old) {
+  New->X = CenterX(Old->Mean);
+  New->Y = CenterY(Old->Mean);
+  New->Length = LengthOf(Old->Mean);
+  New->Angle = OrientationOf(Old->Mean);
+  FillABC(New);
+} /* MakeNewFromOld */
+
+/*-------------------once in subfeat---------------------------------*/
+
+/**
+ * @name SubfeatureEvidence
+ *
+ * Compare a feature to a prototype. Print the result.
+ */
+float SubfeatureEvidence(FEATURE Feature, PROTO_STRUCT *Proto) {
+  float Distance;
+  float Dangle;
+
+  Dangle = Proto->Angle - Feature->Params[PicoFeatDir];
+  if (Dangle < -0.5) {
+    Dangle += 1.0;
+  }
+  if (Dangle > 0.5) {
+    Dangle -= 1.0;
+  }
+  Dangle *= training_angle_match_scale;
+
+  Distance =
+      Proto->A * Feature->Params[PicoFeatX] + Proto->B * Feature->Params[PicoFeatY] + Proto->C;
+
+  return (EvidenceOf(Distance * Distance + Dangle * Dangle));
+}
+
+/**
+ * @name EvidenceOf
+ *
+ * Return the new type of evidence number corresponding to this
+ * distance value.  This number is no longer based on the chi squared
+ * approximation.  The equation that represents the transform is:
+ *       1 / (1 + (sim / midpoint) ^ curl)
+ */
+double EvidenceOf(double Similarity) {
+  Similarity /= training_similarity_midpoint;
+
+  if (training_similarity_curl == 3) {
+    Similarity = Similarity * Similarity * Similarity;
+  } else if (training_similarity_curl == 2) {
+    Similarity = Similarity * Similarity;
+  } else {
+    Similarity = pow(Similarity, training_similarity_curl);
+  }
+
+  return (1.0 / (1.0 + Similarity));
+}
+
+/**
+ * This routine returns true if Feature would be matched
+ * by a fast match table built from Proto.
+ *
+ * @param Feature   feature to be "fast matched" to proto
+ * @param Proto   proto being "fast matched" against
+ *
+ * Globals:
+ * - training_tangent_bbox_pad    bounding box pad tangent to proto
+ * - training_orthogonal_bbox_pad bounding box pad orthogonal to proto
+ *
+ * @return true if feature could match Proto.
+ */
+bool DummyFastMatch(FEATURE Feature, PROTO_STRUCT *Proto) {
+  FRECT BoundingBox;
+  float MaxAngleError;
+  float AngleError;
+
+  MaxAngleError = training_angle_pad / 360.0;
+  AngleError = std::fabs(Proto->Angle - Feature->Params[PicoFeatDir]);
+  if (AngleError > 0.5) {
+    AngleError = 1.0 - AngleError;
+  }
+
+  if (AngleError > MaxAngleError) {
+    return false;
+  }
+
+  ComputePaddedBoundingBox(Proto, training_tangent_bbox_pad * GetPicoFeatureLength(),
+                           training_orthogonal_bbox_pad * GetPicoFeatureLength(), &BoundingBox);
+
+  return PointInside(&BoundingBox, Feature->Params[PicoFeatX], Feature->Params[PicoFeatY]);
+} /* DummyFastMatch */
+
+/**
+ * This routine computes a bounding box that encloses the
+ * specified proto along with some padding.  The
+ * amount of padding is specified as separate distances
+ * in the tangential and orthogonal directions.
+ *
+ * @param Proto   proto to compute bounding box for
+ * @param TangentPad  amount of pad to add in direction of segment
+ * @param OrthogonalPad amount of pad to add orthogonal to segment
+ * @param[out] BoundingBox place to put results
+ */
+void ComputePaddedBoundingBox(PROTO_STRUCT *Proto, float TangentPad, float OrthogonalPad,
+                              FRECT *BoundingBox) {
+  float Length = Proto->Length / 2.0 + TangentPad;
+  float Angle = Proto->Angle * 2.0 * M_PI;
+  float CosOfAngle = fabs(std::cos(Angle));
+  float SinOfAngle = fabs(std::sin(Angle));
+
+  float Pad = std::max(CosOfAngle * Length, SinOfAngle * OrthogonalPad);
+  BoundingBox->MinX = Proto->X - Pad;
+  BoundingBox->MaxX = Proto->X + Pad;
+
+  Pad = std::max(SinOfAngle * Length, CosOfAngle * OrthogonalPad);
+  BoundingBox->MinY = Proto->Y - Pad;
+  BoundingBox->MaxY = Proto->Y + Pad;
+
+} /* ComputePaddedBoundingBox */
+
+/**
+ * Return true if point (X,Y) is inside of Rectangle.
+ *
+ * Globals: none
+ *
+ * @return true if point (X,Y) is inside of Rectangle.
+ */
+bool PointInside(FRECT *Rectangle, float X, float Y) {
+  return (X >= Rectangle->MinX) && (X <= Rectangle->MaxX) && (Y >= Rectangle->MinY) &&
+         (Y <= Rectangle->MaxY);
+} /* PointInside */