Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/classify/normmatch.cpp comparison

comparison mupdf-source/thirdparty/tesseract/src/classify/normmatch.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+/******************************************************************************
+** Filename:    normmatch.cpp
+** Purpose:     Simple matcher based on character normalization features.
+** Author:      Dan Johnson
+**
+** (c) Copyright Hewlett-Packard Company, 1988.
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+** http://www.apache.org/licenses/LICENSE-2.0
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+******************************************************************************/
+/*----------------------------------------------------------------------------
+Include Files and Type Defines
+----------------------------------------------------------------------------*/
+#include "normmatch.h"
+#include "classify.h"
+#include "clusttool.h"
+#include "helpers.h"
+#include "normfeat.h"
+#include "params.h"
+#include "unicharset.h"
+#include <cmath>
+#include <cstdio>
+#include <sstream> // for std::istringstream
+namespace tesseract {
+/**
+ * Container for the character normalization prototypes.
+ *
+ * One (possibly empty) prototype list is kept per class; the lists are
+ * indexed by class / unichar id (see ComputeNormMatch and ReadNormProtos).
+ */
+struct NORM_PROTOS {
+  /**
+   * Creates n empty prototype lists.
+   * The constructor is explicit so that a bare size is never silently
+   * converted into a prototype table.
+   * @param n number of classes to reserve a list for
+   */
+  explicit NORM_PROTOS(size_t n) : NumProtos(static_cast<int>(n)), Protos(n) {
+  }
+  /** Number of parameters per prototype (filled in by ReadNormProtos). */
+  int NumParams = 0;
+  /** Number of per-class lists; equals Protos.size() after construction. */
+  int NumProtos;
+  /** Parameter descriptions; released with delete[] in FreeNormProtos. */
+  PARAM_DESC *ParamDesc = nullptr;
+  /** Per-class prototype lists; an entry stays nullptr until protos are read. */
+  std::vector<LIST> Protos;
+};
+/*----------------------------------------------------------------------------
+Private Code
+----------------------------------------------------------------------------*/
+/**
+ * @name NormEvidenceOf
+ *
+ * Maps a normalization adjustment onto an evidence value using
+ *       1 / (1 + (NormAdj / midpoint) ^ curl)
+ * where midpoint is classify_norm_adj_midpoint and curl is
+ * classify_norm_adj_curl.
+ *
+ * @param NormAdj normalization adjustment to convert
+ * @return evidence value given by the formula above
+ */
+static float NormEvidenceOf(float NormAdj) {
+  const float scaled = NormAdj / static_cast<float>(classify_norm_adj_midpoint);
+  float powered;
+  // Exponents 2 and 3 are expanded into plain multiplications; any other
+  // exponent goes through std::pow.
+  if (classify_norm_adj_curl == 2) {
+    powered = scaled * scaled;
+  } else if (classify_norm_adj_curl == 3) {
+    powered = scaled * scaled * scaled;
+  } else {
+    powered = std::pow(scaled, static_cast<float>(classify_norm_adj_curl));
+  }
+  return 1.0f / (1.0f + powered);
+}
+/*----------------------------------------------------------------------------
+Variables
+----------------------------------------------------------------------------*/
+/**
+ * Control knobs for the normalization adjustment process.
+ * NormEvidenceOf() divides its argument by classify_norm_adj_midpoint and
+ * raises the quotient to the power classify_norm_adj_curl.
+ * NOTE(review): the numeric macro arguments (32.0 and 2.0) are presumably the
+ * default values of the parameters; confirm against the double_VAR macro.
+ */
+double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
+double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
+/**
+ * Weight of width variance against height and vertical position.
+ * ComputeNormMatch() multiplies the width (CharNormRy) distance term by this
+ * factor before adding it to the total distance.
+ */
+const float kWidthErrorWeighting = 0.125f;
+/*----------------------------------------------------------------------------
+Public Code
+----------------------------------------------------------------------------*/
+/**
+ * Scores a character normalization feature against the normalization
+ * protos stored for ClassId and reports the rating of the closest proto.
+ *
+ * A ClassId of NO_CLASS, or one that is not below NormProtos->NumProtos, is
+ * treated as a request for classification as noise.  A class without any
+ * protos yields 1.0f.
+ *
+ * @param ClassId id of class to match against
+ * @param feature character normalization feature
+ * @param DebugMatch controls dump of debug info
+ *
+ * Globals:
+ * #NormProtos character normalization prototypes
+ *
+ * @return 1 - NormEvidenceOf(d), where d is the smallest weighted distance
+ * between feature and any proto of ClassId.
+ */
+float Classify::ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) {
+  /* out-of-range ids are handled exactly like explicit noise requests */
+  if (ClassId >= NormProtos->NumProtos || ClassId == NO_CLASS) {
+    /* kludge - clean up constants and make into control knobs later */
+    const auto length = feature.Params[CharNormLength];
+    const auto rx = feature.Params[CharNormRx];
+    const auto ry = feature.Params[CharNormRy];
+    float noise_dist = (length * length * 500.0f + rx * rx * 8000.0f + ry * ry * 8000.0f);
+    return (1 - NormEvidenceOf(noise_dist));
+  }
+
+  if (DebugMatch) {
+    tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
+  }
+
+  LIST proto_list = NormProtos->Protos[ClassId];
+  if (proto_list == nullptr) {
+    // Avoid FP overflow in NormEvidenceOf.
+    return 1.0f;
+  }
+
+  float best_dist = FLT_MAX;
+  iterate(proto_list) {
+    auto proto = reinterpret_cast<PROTOTYPE *>(proto_list->first_node());
+
+    // Vertical position term.
+    float y_delta = feature.Params[CharNormY] - proto->Mean[CharNormY];
+    float dist = y_delta * y_delta * proto->Weight.Elliptical[CharNormY];
+    if (DebugMatch) {
+      tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", proto->Mean[CharNormY], y_delta,
+              proto->Weight.Elliptical[CharNormY], dist);
+    }
+
+    // Rx term (reported as "Height" in the debug output).
+    float rx_delta = feature.Params[CharNormRx] - proto->Mean[CharNormRx];
+    dist += rx_delta * rx_delta * proto->Weight.Elliptical[CharNormRx];
+    if (DebugMatch) {
+      tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", proto->Mean[CharNormRx], rx_delta,
+              proto->Weight.Elliptical[CharNormRx], dist);
+    }
+
+    // Ry is width! See intfx.cpp.
+    float ry_delta = feature.Params[CharNormRy] - proto->Mean[CharNormRy];
+    if (DebugMatch) {
+      tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", proto->Mean[CharNormRy], ry_delta,
+              proto->Weight.Elliptical[CharNormRy]);
+    }
+    // The width term is down-weighted before it joins the total distance.
+    float width_dist = ry_delta * ry_delta * proto->Weight.Elliptical[CharNormRy];
+    width_dist *= kWidthErrorWeighting;
+    dist += width_dist;
+    if (DebugMatch) {
+      tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", dist,
+              dist / classify_norm_adj_midpoint, NormEvidenceOf(dist),
+              256 * (1 - NormEvidenceOf(dist)));
+    }
+
+    best_dist = (dist < best_dist) ? dist : best_dist;
+  }
+  return 1 - NormEvidenceOf(best_dist);
+} /* ComputeNormMatch */
+/**
+ * Releases the character normalization prototypes held in NormProtos:
+ * every per-class proto list, the parameter descriptions and the container
+ * itself.  NormProtos is reset to nullptr afterwards.  Does nothing when
+ * NormProtos is already nullptr.
+ */
+void Classify::FreeNormProtos() {
+  if (NormProtos == nullptr) {
+    return;
+  }
+  const int list_count = NormProtos->NumProtos;
+  for (int idx = 0; idx < list_count; ++idx) {
+    FreeProtoList(&NormProtos->Protos[idx]);
+  }
+  delete[] NormProtos->ParamDesc;
+  delete NormProtos;
+  NormProtos = nullptr;
+}
+/**
+ * This routine allocates a new data structure to hold
+ * a set of character normalization protos.  It then fills in
+ * the data structure by reading from the specified File.
+ *
+ * After the header (sample size and parameter descriptions) each class is
+ * introduced by a line holding a unichar and a proto count, followed by
+ * that many prototypes.  Lines that do not parse as "unichar count" are
+ * skipped.  Protos of a unichar that is not in the unicharset are read and
+ * discarded after an error message.
+ *
+ * @param fp open text file to read normalization protos from
+ * Globals: none
+ * @return Character normalization protos (allocated with new).
+ */
+NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {
+  /* allocate and initialize data structure */
+  auto NormProtos = new NORM_PROTOS(unicharset.size());
+
+  /* read file header and save in data structure */
+  NormProtos->NumParams = ReadSampleSize(fp);
+  NormProtos->ParamDesc = ReadParamDesc(fp, NormProtos->NumParams);
+
+  /* read protos for each class into a separate list */
+  const int kMaxLineSize = 100;
+  char line[kMaxLineSize];
+  while (fp->FGets(line, kMaxLineSize) != nullptr) {
+    std::istringstream stream(line);
+    stream.imbue(std::locale::classic());
+    // Extract into a std::string: operator>> into a plain char array is
+    // unbounded before C++20, so an over-long token in the file could
+    // overflow a fixed-size buffer.
+    std::string unichar;
+    int NumProtos = 0;
+    stream >> unichar >> NumProtos;
+    if (stream.fail()) {
+      continue;
+    }
+    if (unicharset.contains_unichar(unichar.c_str())) {
+      const UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar.c_str());
+      // Append to whatever is already stored for this class.
+      LIST Protos = NormProtos->Protos[unichar_id];
+      for (int i = 0; i < NumProtos; i++) {
+        Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
+      }
+      NormProtos->Protos[unichar_id] = Protos;
+    } else {
+      tprintf("Error: unichar %s in normproto file is not in unichar set.\n", unichar.c_str());
+      // Still consume the protos so the reader stays in sync with the file.
+      for (int i = 0; i < NumProtos; i++) {
+        FreePrototype(ReadPrototype(fp, NormProtos->NumParams));
+      }
+    }
+  }
+  return NormProtos;
+} /* ReadNormProtos */
+} // namespace tesseract

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/classify/normmatch.cpp @ 2:b50eed0cc0ef upstream