comparison: mupdf-source/thirdparty/tesseract/src/classify/intfx.cpp @ 2:b50eed0cc0ef (upstream)
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer includes a version number.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
The comparison is between revisions 1:1d09e1dec1d9 and 2:b50eed0cc0ef; the file content follows.
```cpp
/******************************************************************************
** Filename: intfx.cpp
** Purpose: Integer character normalization & feature extraction
** Author: Robert Moss, rays@google.com (Ray Smith)
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*****************************************************************************/
/**----------------------------------------------------------------------------
          Include Files and Type Defines
----------------------------------------------------------------------------**/

#define _USE_MATH_DEFINES // for M_PI

#include "intfx.h"

#include "classify.h"
#include "intmatcher.h"
#include "linlsq.h"
#include "normalis.h"
#include "statistc.h"
#include "trainingsample.h"

#include "helpers.h"

#include <allheaders.h>

#include <cmath> // for M_PI
#include <mutex> // for std::mutex

namespace tesseract {

/**----------------------------------------------------------------------------
          Global Data Definitions and Declarations
----------------------------------------------------------------------------**/
// Lookup tables for cos and sin to turn the intfx feature angle to a vector.
// Protected by atan_table_mutex.
// The entries are in binary degrees where a full circle is 256 binary degrees.
static float cos_table[INT_CHAR_NORM_RANGE];
static float sin_table[INT_CHAR_NORM_RANGE];

/**----------------------------------------------------------------------------
          Public Code
----------------------------------------------------------------------------**/

void InitIntegerFX() {
  // Guards initialization of the cos/sin tables so we don't fill them more
  // than once.
  static std::mutex atan_table_mutex;
  static bool atan_table_init = false;
  std::lock_guard<std::mutex> guard(atan_table_mutex);
  if (!atan_table_init) {
    for (int i = 0; i < INT_CHAR_NORM_RANGE; ++i) {
      cos_table[i] = cos(i * 2 * M_PI / INT_CHAR_NORM_RANGE + M_PI);
      sin_table[i] = sin(i * 2 * M_PI / INT_CHAR_NORM_RANGE + M_PI);
    }
    atan_table_init = true;
  }
}

// Returns a vector representing the direction of a feature with the given
// theta direction in an INT_FEATURE_STRUCT.
FCOORD FeatureDirection(uint8_t theta) {
  return FCOORD(cos_table[theta], sin_table[theta]);
}
```
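The tables above store angles in "binary degrees" (256 to a full circle, matching INT_CHAR_NORM_RANGE) with an extra half-turn offset of M_PI, so theta = 0 maps to the vector <-1, 0>; the ExtractFeatures comment near the end of the file describes feature angles the same way. A minimal standalone sketch of that conversion (plain C++, no Tesseract types; kRange is a stand-in for INT_CHAR_NORM_RANGE):

```cpp
#define _USE_MATH_DEFINES // for M_PI on MSVC, as in the file above
#include <cmath>
#include <cstdio>

int main() {
  const int kRange = 256; // stand-in for INT_CHAR_NORM_RANGE
  const int thetas[] = {0, 64, 128, 192};
  for (int theta : thetas) {
    // Same formula as the table initialization above: note the + M_PI offset.
    double angle = theta * 2.0 * M_PI / kRange + M_PI;
    std::printf("theta=%3d -> (%+.1f, %+.1f)\n", theta, std::cos(angle), std::sin(angle));
  }
  return 0;
}
```

Here theta = 0 comes out as (-1, 0) and theta = 128 as (1, 0), confirming the half-turn offset.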
```cpp
// Generates a TrainingSample from a TBLOB. Extracts features and sets
// the bounding box, so classifiers that operate on the image can work.
// TODO(rays) Make BlobToTrainingSample a member of Classify now that
// the FlexFx and FeatureDescription code have been removed and LearnBlob
// is now a member of Classify.
TrainingSample *BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm,
                                     INT_FX_RESULT_STRUCT *fx_info,
                                     std::vector<INT_FEATURE_STRUCT> *bl_features) {
  std::vector<INT_FEATURE_STRUCT> cn_features;
  Classify::ExtractFeatures(blob, nonlinear_norm, bl_features, &cn_features, fx_info, nullptr);
  // TODO(rays) Use blob->PreciseBoundingBox() instead.
  TBOX box = blob.bounding_box();
  TrainingSample *sample = nullptr;
  int num_features = fx_info->NumCN;
  if (num_features > 0) {
    sample = TrainingSample::CopyFromFeatures(*fx_info, box, &cn_features[0], num_features);
  }
  if (sample != nullptr) {
    // Set the bounding box (in original image coordinates) in the sample.
    TPOINT topleft, botright;
    topleft.x = box.left();
    topleft.y = box.top();
    botright.x = box.right();
    botright.y = box.bottom();
    TPOINT original_topleft, original_botright;
    blob.denorm().DenormTransform(nullptr, topleft, &original_topleft);
    blob.denorm().DenormTransform(nullptr, botright, &original_botright);
    sample->set_bounding_box(
        TBOX(original_topleft.x, original_botright.y, original_botright.x, original_topleft.y));
  }
  return sample;
}
```
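DENORM, DenormTransform, and TBOX are Tesseract internals, so the corner-mapping step above cannot be shown runnable as-is. The following toy version (hypothetical Point/Box/ToyDenorm types, made-up parameters, and a plain scale-plus-offset transform in place of Tesseract's more general transform chain) illustrates only the idea: denormalize the two corners, then rebuild a box from them:

```cpp
#include <algorithm>
#include <cstdio>

// Hypothetical stand-ins for Tesseract's coordinate types.
struct Point { double x, y; };
struct Box { double left, bottom, right, top; };

// Toy inverse of the normalization x' = (x - x0) * sx + xc (likewise for y).
struct ToyDenorm {
  double x0, y0, sx, sy, xc, yc;
  Point Denorm(Point p) const {
    return {(p.x - xc) / sx + x0, (p.y - yc) / sy + y0};
  }
};

int main() {
  ToyDenorm d{100, 50, 0.5, 0.5, 128, 128};  // assumed example parameters
  Point topleft{96, 160}, botright{160, 96}; // normalized box corners
  Point otl = d.Denorm(topleft), obr = d.Denorm(botright);
  // Rebuild with min/max so the box stays valid even if an axis flipped;
  // the real code passes the corners straight to the TBOX constructor.
  Box box{std::min(otl.x, obr.x), std::min(otl.y, obr.y),
          std::max(otl.x, obr.x), std::max(otl.y, obr.y)};
  std::printf("image box: l=%.0f b=%.0f r=%.0f t=%.0f\n",
              box.left, box.bottom, box.right, box.top);
  return 0;
}
```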
```cpp
// Computes the DENORMS for bl(baseline) and cn(character) normalization
// during feature extraction. The input denorm describes the current state
// of the blob, which is usually a baseline-normalized word.
// The transforms are set up as follows:
// Baseline Normalized (bl) Output:
//   We center the grapheme by aligning the x-coordinate of its centroid with
//   x=128 and leaving the already-baseline-normalized y as-is.
//
// Character Normalized (cn) Output:
//   We align the grapheme's centroid at the origin and scale it
//   asymmetrically in x and y so that the 2nd moments are a standard value
//   (51.2), i.e. the result is vaguely square.
// If classify_nonlinear_norm is true:
//   A non-linear normalization is set up that attempts to evenly distribute
//   edges across x and y.
//
// Some of the fields of fx_info are also set up:
// Length: Total length of outline.
// Rx: Rounded y second moment. (Reversed by convention.)
// Ry: Rounded x second moment.
// Xmean: Rounded x center of mass of the blob.
// Ymean: Rounded y center of mass of the blob.
void Classify::SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm,
                                DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info) {
  // Compute 1st and 2nd moments of the original outline.
  FCOORD center, second_moments;
  int length = blob.ComputeMoments(&center, &second_moments);
  if (fx_info != nullptr) {
    fx_info->Length = length;
    fx_info->Rx = IntCastRounded(second_moments.y());
    fx_info->Ry = IntCastRounded(second_moments.x());

    fx_info->Xmean = IntCastRounded(center.x());
    fx_info->Ymean = IntCastRounded(center.y());
  }
  // Set up the denorm for baseline normalization.
  bl_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), 128.0f, 1.0f, 1.0f,
                                128.0f, 128.0f);
  // Set up the denorm for character normalization.
  if (nonlinear_norm) {
    std::vector<std::vector<int>> x_coords;
    std::vector<std::vector<int>> y_coords;
    TBOX box;
    blob.GetPreciseBoundingBox(&box);
    box.pad(1, 1);
    blob.GetEdgeCoords(box, x_coords, y_coords);
    cn_denorm->SetupNonLinear(&blob.denorm(), box, UINT8_MAX, UINT8_MAX, 0.0f, 0.0f, x_coords,
                              y_coords);
  } else {
    cn_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), center.y(),
                                  51.2f / second_moments.x(), 51.2f / second_moments.y(), 128.0f,
                                  128.0f);
  }
}
```
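In the linear (else) branch above, each axis is scaled by 51.2 / second_moment about the centroid and re-centered on 128, which is what makes the output "vaguely square". A standalone sketch of that arithmetic, using a root-mean-square deviation as a simplified stand-in for TBLOB::ComputeMoments (the sample points are made up):

```cpp
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // A few points standing in for a blob outline (hypothetical data).
  std::vector<std::pair<double, double>> pts{{10, 5}, {20, 5}, {20, 45}, {10, 45}};
  double cx = 0, cy = 0;
  for (auto &p : pts) { cx += p.first; cy += p.second; }
  cx /= pts.size();
  cy /= pts.size();
  // Per-axis second moments (RMS deviation from the center), simplified.
  double mx = 0, my = 0;
  for (auto &p : pts) {
    mx += (p.first - cx) * (p.first - cx);
    my += (p.second - cy) * (p.second - cy);
  }
  mx = std::sqrt(mx / pts.size());
  my = std::sqrt(my / pts.size());
  // Asymmetric scaling so both second moments become 51.2, centered on 128.
  for (auto &p : pts) {
    double nx = 128.0 + (p.first - cx) * 51.2 / mx;
    double ny = 128.0 + (p.second - cy) * 51.2 / my;
    std::printf("(%5.1f, %5.1f) -> (%6.1f, %6.1f)\n", p.first, p.second, nx, ny);
  }
  return 0;
}
```

Here a 10 by 40 rectangle of points maps onto a square region spanning 76.8 to 179.2 on both axes.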
```cpp
// Helper normalizes the direction, assuming that it is at the given
// unnormed_pos, using the given denorm, starting at the root_denorm.
static uint8_t NormalizeDirection(uint8_t dir, const FCOORD &unnormed_pos, const DENORM &denorm,
                                  const DENORM *root_denorm) {
  // Convert direction to a vector.
  FCOORD unnormed_end;
  unnormed_end.from_direction(dir);
  unnormed_end += unnormed_pos;
  FCOORD normed_pos, normed_end;
  denorm.NormTransform(root_denorm, unnormed_pos, &normed_pos);
  denorm.NormTransform(root_denorm, unnormed_end, &normed_end);
  normed_end -= normed_pos;
  return normed_end.to_direction();
}
```
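NormalizeDirection re-measures a direction after the coordinate transform because an anisotropic scale (such as the 51.2/moment scaling above) changes angles. The sketch below uses atan2-based binary-degree codecs as rough stand-ins for FCOORD::from_direction/to_direction (the exact conventions of those helpers are not reproduced here) and a made-up anisotropic scale:

```cpp
#define _USE_MATH_DEFINES // for M_PI on MSVC
#include <cmath>
#include <cstdio>

// Rough stand-ins for a binary-degree (256 = full circle) angle codec.
static void FromDir(int dir, double *dx, double *dy) {
  double a = dir * 2.0 * M_PI / 256.0;
  *dx = std::cos(a);
  *dy = std::sin(a);
}
static int ToDir(double dx, double dy) {
  int dir = (int)std::lround(std::atan2(dy, dx) * 256.0 / (2.0 * M_PI));
  return (dir + 256) & 255; // wrap into [0, 256)
}

int main() {
  const double sx = 2.0, sy = 0.5; // assumed anisotropic normalization scale
  const int dirs[] = {0, 32, 64};
  for (int dir : dirs) {
    double dx, dy;
    FromDir(dir, &dx, &dy);
    // Take a unit step in direction dir, transform it, re-measure the angle.
    std::printf("dir %2d -> normalized dir %2d\n", dir, ToDir(dx * sx, dy * sy));
  }
  return 0;
}
```

The diagonal (dir 32) flattens toward the x axis (dir 10), while axis-aligned directions are unchanged.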
```cpp
// Helper returns the mean direction vector from the given stats. Use the
// mean direction from dirs if there is information available, otherwise use
// the fit_vector from point_diffs.
static FCOORD MeanDirectionVector(const LLSQ &point_diffs, const LLSQ &dirs, const FCOORD &start_pt,
                                  const FCOORD &end_pt) {
  FCOORD fit_vector;
  if (dirs.count() > 0) {
    // There were directions, so use them. To avoid wrap-around problems, we
    // have 2 accumulators in dirs: x for normal directions and y for
    // directions offset by 128. We will use the one with the least variance.
    FCOORD mean_pt = dirs.mean_point();
    double mean_dir = 0.0;
    if (dirs.x_variance() <= dirs.y_variance()) {
      mean_dir = mean_pt.x();
    } else {
      mean_dir = mean_pt.y() + 128;
    }
    fit_vector.from_direction(Modulo(IntCastRounded(mean_dir), 256));
  } else {
    // There were no directions, so we rely on the vector_fit to the points.
    // Since the vector_fit is 180 degrees ambiguous, we align with the
    // supplied feature_dir by making the scalar product non-negative.
    FCOORD feature_dir(end_pt - start_pt);
    fit_vector = point_diffs.vector_fit();
    if (fit_vector.x() == 0.0f && fit_vector.y() == 0.0f) {
      // There was only a single point. Use feature_dir directly.
      fit_vector = feature_dir;
    } else {
      // Sometimes the least mean squares fit is wrong, due to the small sample
      // of points and scaling. Use a 90 degree rotated vector if that matches
      // feature_dir better.
      FCOORD fit_vector2 = !fit_vector;
      // The fit_vector is 180 degrees ambiguous, so resolve the ambiguity by
      // insisting that the scalar product with the feature_dir should be +ve.
      if (fit_vector % feature_dir < 0.0) {
        fit_vector = -fit_vector;
      }
      if (fit_vector2 % feature_dir < 0.0) {
        fit_vector2 = -fit_vector2;
      }
      // Even though fit_vector2 has a higher mean squared error, it might be
      // a better fit, so use it if the dot product with feature_dir is bigger.
      if (fit_vector2 % feature_dir > fit_vector % feature_dir) {
        fit_vector = fit_vector2;
      }
    }
  }
  return fit_vector;
}
```
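The dual-accumulator trick in the first branch above can be reproduced standalone: accumulate both dir and (dir + 128) mod 256, then take the mean from whichever accumulator has the smaller variance. A minimal sketch with plain running sums in place of LLSQ:

```cpp
#include <cstdio>
#include <vector>

// Circular mean of binary-degree directions (256 = full circle): directions
// clustered around the wrap point average correctly in the +128 accumulator.
static int MeanDir(const std::vector<int> &dirs) {
  double s0 = 0, s1 = 0, q0 = 0, q1 = 0;
  for (int d : dirs) {
    int d1 = (d + 128) & 255;
    s0 += d;  q0 += double(d) * d;
    s1 += d1; q1 += double(d1) * d1;
  }
  double n = double(dirs.size());
  double var0 = q0 / n - (s0 / n) * (s0 / n);
  double var1 = q1 / n - (s1 / n) * (s1 / n);
  // Trust the accumulator with the smaller variance; undo the offset if the
  // shifted one wins.
  double mean = var0 <= var1 ? s0 / n : s1 / n + 128;
  return int(mean + 0.5) & 255;
}

int main() {
  // A cluster straddling the wrap-around point.
  std::printf("mean = %d\n", MeanDir({250, 252, 254, 2, 4, 6}));
  return 0;
}
```

For this cluster a naive mean would give 128 (pointing the opposite way); the sketch prints 0.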
```cpp
// Helper computes one or more features corresponding to the given points.
// Emitted features are on the line defined by:
// start_pt + lambda * (end_pt - start_pt) for scalar lambda.
// Features are spaced at feature_length intervals.
static int ComputeFeatures(const FCOORD &start_pt, const FCOORD &end_pt, double feature_length,
                           std::vector<INT_FEATURE_STRUCT> *features) {
  FCOORD feature_vector(end_pt - start_pt);
  if (feature_vector.x() == 0.0f && feature_vector.y() == 0.0f) {
    return 0;
  }
  // Compute theta for the feature based on its direction.
  uint8_t theta = feature_vector.to_direction();
  // Compute the number of features and lambda_step.
  double target_length = feature_vector.length();
  int num_features = IntCastRounded(target_length / feature_length);
  if (num_features == 0) {
    return 0;
  }
  // Divide the length evenly into num_features pieces.
  double lambda_step = 1.0 / num_features;
  double lambda = lambda_step / 2.0;
  for (int f = 0; f < num_features; ++f, lambda += lambda_step) {
    FCOORD feature_pt(start_pt);
    feature_pt += feature_vector * lambda;
    INT_FEATURE_STRUCT feature(feature_pt, theta);
    features->push_back(feature);
  }
  return num_features;
}
```
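ComputeFeatures emits round(length / feature_length) features at the midpoints of equal subdivisions of the fitted segment. With kStandardFeatureLength = 12.8 (the value quoted in the ExtractFeatures comment further down), a 40-unit segment yields 3 features at lambda = 1/6, 1/2, 5/6. A standalone check of that arithmetic (the endpoints are made up):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const double feature_length = 12.8; // kStandardFeatureLength
  // A 40-unit horizontal segment from (0,0) to (40,0).
  double sx = 0, sy = 0, ex = 40, ey = 0;
  double len = std::hypot(ex - sx, ey - sy);
  int num_features = (int)std::lround(len / feature_length); // round(3.125) = 3
  double lambda_step = 1.0 / num_features;
  double lambda = lambda_step / 2.0; // midpoints: 1/6, 1/2, 5/6
  for (int f = 0; f < num_features; ++f, lambda += lambda_step) {
    std::printf("feature %d at (%.2f, %.2f)\n", f,
                sx + (ex - sx) * lambda, sy + (ey - sy) * lambda);
  }
  return 0;
}
```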
```cpp
// Gathers outline points and their directions from start_index into dirs by
// stepping along the outline and normalizing the coordinates until the
// required feature_length has been collected or end_index is reached.
// On input pos must point to the position corresponding to start_index and on
// return pos is updated to the current raw position, and pos_normed is set to
// the normed version of pos.
// Since directions wrap around, they need special treatment to get the mean.
// Provided the cluster of directions doesn't straddle the wrap-around point,
// the simple mean works. If they do, then, unless the directions are wildly
// varying, the cluster rotated by 180 degrees will not straddle the wrap-
// around point, so mean(dir + 180 degrees) - 180 degrees will work. Since
// LLSQ conveniently stores the mean of 2 variables, we use it to store
// dir and dir+128 (128 is 180 degrees) and then use the resulting mean
// with the least variance.
static int GatherPoints(const C_OUTLINE *outline, double feature_length, const DENORM &denorm,
                        const DENORM *root_denorm, int start_index, int end_index, ICOORD *pos,
                        FCOORD *pos_normed, LLSQ *points, LLSQ *dirs) {
  int step_length = outline->pathlength();
  ICOORD step = outline->step(start_index % step_length);
  // Prev_normed is the start point of this collection and will be set on the
  // first iteration, and on later iterations used to determine the length
  // that has been collected.
  FCOORD prev_normed;
  points->clear();
  dirs->clear();
  int num_points = 0;
  int index;
  for (index = start_index; index <= end_index; ++index, *pos += step) {
    step = outline->step(index % step_length);
    int edge_weight = outline->edge_strength_at_index(index % step_length);
    if (edge_weight == 0) {
      // This point has conflicting gradient and step direction, so ignore it.
      continue;
    }
    // Get the sub-pixel precise location and normalize.
    FCOORD f_pos = outline->sub_pixel_pos_at_index(*pos, index % step_length);
    denorm.NormTransform(root_denorm, f_pos, pos_normed);
    if (num_points == 0) {
      // The start of this segment.
      prev_normed = *pos_normed;
    } else {
      FCOORD offset = *pos_normed - prev_normed;
      float length = offset.length();
      if (length > feature_length) {
        // We have gone far enough from the start. We will use this point in
        // the next set so return what we have so far.
        return index;
      }
    }
    points->add(pos_normed->x(), pos_normed->y(), edge_weight);
    int direction = outline->direction_at_index(index % step_length);
    if (direction >= 0) {
      direction = NormalizeDirection(direction, f_pos, denorm, root_denorm);
      // Use both the direction and direction +128 so we are not trying to
      // take the mean of something straddling the wrap-around point.
      dirs->add(direction, Modulo(direction + 128, 256));
    }
    ++num_points;
  }
  return index;
}
```
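GatherPoints and its caller treat the outline as circular: end_index may run past pathlength(), and every step access is taken modulo step_length. A tiny standalone sketch of that wrap-around indexing (toy numbers):

```cpp
#include <cstdio>

int main() {
  const int step_length = 8;           // pathlength of a toy circular outline
  int start_index = 6, end_index = 10; // runs past the end, wraps to step 2
  for (int index = start_index; index <= end_index; ++index) {
    std::printf("index %2d -> step %d\n", index, index % step_length);
  }
  return 0;
}
```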
```cpp
// Extracts Tesseract features and appends them to the features vector.
// Startpt to lastpt, inclusive, MUST have the same src_outline member,
// which may be nullptr. The vector from lastpt to its next is included in
// the feature extraction. Hidden edges should be excluded by the caller.
// If force_poly is true, the features will be extracted from the polygonal
// approximation even if more accurate data is available.
static void ExtractFeaturesFromRun(const EDGEPT *startpt, const EDGEPT *lastpt,
                                   const DENORM &denorm, double feature_length, bool force_poly,
                                   std::vector<INT_FEATURE_STRUCT> *features) {
  const EDGEPT *endpt = lastpt->next;
  const C_OUTLINE *outline = startpt->src_outline;
  if (outline != nullptr && !force_poly) {
    // Detailed information is available. We have to normalize only from
    // the root_denorm to denorm.
    const DENORM *root_denorm = denorm.RootDenorm();
    int total_features = 0;
    // Get the features from the outline.
    int step_length = outline->pathlength();
    int start_index = startpt->start_step;
    // pos is the integer coordinates of the binary image steps.
    ICOORD pos = outline->position_at_index(start_index);
    // We use an end_index that allows us to use a positive increment, but that
    // may be beyond the bounds of the outline steps due to wrap-around, so we
    // use % step_length everywhere, except for start_index.
    int end_index = lastpt->start_step + lastpt->step_count;
    if (end_index <= start_index) {
      end_index += step_length;
    }
    LLSQ prev_points;
    LLSQ prev_dirs;
    FCOORD prev_normed_pos = outline->sub_pixel_pos_at_index(pos, start_index);
    denorm.NormTransform(root_denorm, prev_normed_pos, &prev_normed_pos);
    LLSQ points;
    LLSQ dirs;
    FCOORD normed_pos(0.0f, 0.0f);
    int index = GatherPoints(outline, feature_length, denorm, root_denorm, start_index, end_index,
                             &pos, &normed_pos, &points, &dirs);
    while (index <= end_index) {
      // At each iteration we nominally have 3 accumulated sets of points and
      // dirs: prev_points/dirs, points/dirs, next_points/dirs and sum them
      // into sum_points/dirs, but we don't necessarily get any features out,
      // so if that is the case, we keep accumulating instead of rotating the
      // accumulators.
      LLSQ next_points;
      LLSQ next_dirs;
      FCOORD next_normed_pos(0.0f, 0.0f);
      index = GatherPoints(outline, feature_length, denorm, root_denorm, index, end_index, &pos,
                           &next_normed_pos, &next_points, &next_dirs);
      LLSQ sum_points(prev_points);
      // TODO(rays) find out why it is better to use just dirs and next_dirs
      // in sum_dirs, instead of using prev_dirs as well.
      LLSQ sum_dirs(dirs);
      sum_points.add(points);
      sum_points.add(next_points);
      sum_dirs.add(next_dirs);
      bool made_features = false;
      // If we have some points, we can try making some features.
      if (sum_points.count() > 0) {
        // We have gone far enough from the start. Make a feature and restart.
        FCOORD fit_pt = sum_points.mean_point();
        FCOORD fit_vector = MeanDirectionVector(sum_points, sum_dirs, prev_normed_pos, normed_pos);
        // The segment to which we fit features is the line passing through
        // fit_pt in direction of fit_vector that starts nearest to
        // prev_normed_pos and ends nearest to normed_pos.
        FCOORD start_pos = prev_normed_pos.nearest_pt_on_line(fit_pt, fit_vector);
        FCOORD end_pos = normed_pos.nearest_pt_on_line(fit_pt, fit_vector);
        // Possible correction to match the adjacent polygon segment.
        if (total_features == 0 && startpt != endpt) {
          FCOORD poly_pos(startpt->pos.x, startpt->pos.y);
          denorm.LocalNormTransform(poly_pos, &start_pos);
        }
        if (index > end_index && startpt != endpt) {
          FCOORD poly_pos(endpt->pos.x, endpt->pos.y);
          denorm.LocalNormTransform(poly_pos, &end_pos);
        }
        int num_features = ComputeFeatures(start_pos, end_pos, feature_length, features);
        if (num_features > 0) {
          // We made some features so shuffle the accumulators.
          prev_points = points;
          prev_dirs = dirs;
          prev_normed_pos = normed_pos;
          points = next_points;
          dirs = next_dirs;
          made_features = true;
          total_features += num_features;
        }
        // The end of the next set becomes the end next time around.
        normed_pos = next_normed_pos;
      }
      if (!made_features) {
        // We didn't make any features, so keep the prev accumulators and
        // add the next ones into the current.
        points.add(next_points);
        dirs.add(next_dirs);
      }
    }
  } else {
    // There is no outline, so we are forced to use the polygonal approximation.
    const EDGEPT *pt = startpt;
    do {
      FCOORD start_pos(pt->pos.x, pt->pos.y);
      FCOORD end_pos(pt->next->pos.x, pt->next->pos.y);
      denorm.LocalNormTransform(start_pos, &start_pos);
      denorm.LocalNormTransform(end_pos, &end_pos);
      ComputeFeatures(start_pos, end_pos, feature_length, features);
    } while ((pt = pt->next) != endpt);
  }
}
```
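Stripped of the geometry, the while loop above is a three-window accumulation scheme: sum prev + current + next, and only rotate the windows when features were actually emitted; otherwise fold next into current so evidence keeps building. The sketch below shows just that control flow, with integer counts standing in for the LLSQ accumulators and an invented emission threshold:

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Toy "point counts per gathered segment" standing in for LLSQ contents.
  std::vector<int> segments{1, 1, 1, 5, 2, 6, 1};
  int prev = 0, cur = segments[0];
  for (size_t i = 1; i < segments.size(); ++i) {
    int next = segments[i];
    int sum = prev + cur + next;
    if (sum >= 6) { // enough evidence: "emit a feature" and rotate windows
      std::printf("emit with sum=%d\n", sum);
      prev = cur;
      cur = next;
    } else { // not enough: keep prev, fold next into the current window
      cur += next;
    }
  }
  return 0;
}
```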
```cpp
// Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as
// (x,y) position and angle as measured counterclockwise from the vector
// <-1, 0>, from blob using two normalizations defined by bl_denorm and
// cn_denorm. See SetupBLCNDenorms for definitions.
// If outline_cn_counts is not nullptr, on return it contains the cumulative
// number of cn features generated for each outline in the blob (in order).
// Thus after the first outline, there were (*outline_cn_counts)[0] features,
// after the second outline, there were (*outline_cn_counts)[1] features, etc.
void Classify::ExtractFeatures(const TBLOB &blob, bool nonlinear_norm,
                               std::vector<INT_FEATURE_STRUCT> *bl_features,
                               std::vector<INT_FEATURE_STRUCT> *cn_features,
                               INT_FX_RESULT_STRUCT *results,
                               std::vector<int> *outline_cn_counts) {
  DENORM bl_denorm, cn_denorm;
  tesseract::Classify::SetupBLCNDenorms(blob, nonlinear_norm, &bl_denorm, &cn_denorm, results);
  if (outline_cn_counts != nullptr) {
    outline_cn_counts->clear();
  }
  // Iterate the outlines.
  for (TESSLINE *ol = blob.outlines; ol != nullptr; ol = ol->next) {
    // Iterate the polygon.
    EDGEPT *loop_pt = ol->FindBestStartPt();
    EDGEPT *pt = loop_pt;
    if (pt == nullptr) {
      continue;
    }
    do {
      if (pt->IsHidden()) {
        continue;
      }
      // Find a run of equal src_outline.
      EDGEPT *last_pt = pt;
      do {
        last_pt = last_pt->next;
      } while (last_pt != loop_pt && !last_pt->IsHidden() &&
               last_pt->src_outline == pt->src_outline);
      last_pt = last_pt->prev;
      // Until the adaptive classifier can be weaned off polygon segments,
      // we have to force extraction from the polygon for the bl_features.
      ExtractFeaturesFromRun(pt, last_pt, bl_denorm, kStandardFeatureLength, true, bl_features);
      ExtractFeaturesFromRun(pt, last_pt, cn_denorm, kStandardFeatureLength, false, cn_features);
      pt = last_pt;
    } while ((pt = pt->next) != loop_pt);
    if (outline_cn_counts != nullptr) {
      outline_cn_counts->push_back(cn_features->size());
    }
  }
  results->NumBL = bl_features->size();
  results->NumCN = cn_features->size();
  results->YBottom = blob.bounding_box().bottom();
  results->YTop = blob.bounding_box().top();
  results->Width = blob.bounding_box().width();
}

} // namespace tesseract
```
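As the comment on ExtractFeatures notes, outline_cn_counts is cumulative: entry i holds the total number of cn features after outline i. Per-outline counts therefore come from differencing adjacent entries (the counts here are made up):

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Cumulative cn-feature counts for a 3-outline blob, as ExtractFeatures
  // would leave them in *outline_cn_counts (hypothetical values).
  std::vector<int> outline_cn_counts{18, 25, 31};
  int prev = 0;
  for (size_t i = 0; i < outline_cn_counts.size(); ++i) {
    std::printf("outline %zu contributed %d features\n", i, outline_cn_counts[i] - prev);
    prev = outline_cn_counts[i];
  }
  return 0;
}
```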
