Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccstruct/linlsq.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/linlsq.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,259 @@ +/********************************************************************** + * File: linlsq.cpp (Formerly llsq.c) + * Description: Linear Least squares fitting code. + * Author: Ray Smith + * + * (C) Copyright 1991, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#include "linlsq.h" +#include <cmath> // for std::sqrt +#include <cstdio> +#include "errcode.h" + +namespace tesseract { + +constexpr ERRCODE EMPTY_LLSQ("Can't delete from an empty LLSQ"); + +/********************************************************************** + * LLSQ::clear + * + * Function to initialize a LLSQ. + **********************************************************************/ + +void LLSQ::clear() { // initialize + total_weight = 0.0; // no elements + sigx = 0.0; // update accumulators + sigy = 0.0; + sigxx = 0.0; + sigxy = 0.0; + sigyy = 0.0; +} + +/********************************************************************** + * LLSQ::add + * + * Add an element to the accumulator. + **********************************************************************/ + +void LLSQ::add(double x, double y) { // add an element + total_weight++; // count elements + sigx += x; // update accumulators + sigy += y; + sigxx += x * x; + sigxy += x * y; + sigyy += y * y; +} +// Adds an element with a specified weight. +void LLSQ::add(double x, double y, double weight) { + total_weight += weight; + sigx += x * weight; // update accumulators + sigy += y * weight; + sigxx += x * x * weight; + sigxy += x * y * weight; + sigyy += y * y * weight; +} +// Adds a whole LLSQ. +void LLSQ::add(const LLSQ &other) { + total_weight += other.total_weight; + sigx += other.sigx; // update accumulators + sigy += other.sigy; + sigxx += other.sigxx; + sigxy += other.sigxy; + sigyy += other.sigyy; +} + +/********************************************************************** + * LLSQ::remove + * + * Delete an element from the acculuator. + **********************************************************************/ + +void LLSQ::remove(double x, double y) { // delete an element + if (total_weight <= 0.0) { // illegal + EMPTY_LLSQ.error("LLSQ::remove", ABORT); + } + total_weight--; // count elements + sigx -= x; // update accumulators + sigy -= y; + sigxx -= x * x; + sigxy -= x * y; + sigyy -= y * y; +} + +/********************************************************************** + * LLSQ::m + * + * Return the gradient of the line fit. + **********************************************************************/ + +double LLSQ::m() const { // get gradient + double covar = covariance(); + double x_var = x_variance(); + if (x_var != 0.0) { + return covar / x_var; + } else { + return 0.0; // too little + } +} + +/********************************************************************** + * LLSQ::c + * + * Return the constant of the line fit. + **********************************************************************/ + +double LLSQ::c(double m) const { // get constant + if (total_weight > 0.0) { + return (sigy - m * sigx) / total_weight; + } else { + return 0; // too little + } +} + +/********************************************************************** + * LLSQ::rms + * + * Return the rms error of the fit. + **********************************************************************/ + +double LLSQ::rms(double m, double c) const { // get error + double error; // total error + + if (total_weight > 0) { + error = sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c * (total_weight * c - 2 * sigy); + if (error >= 0) { + error = std::sqrt(error / total_weight); // sqrt of mean + } else { + error = 0; + } + } else { + error = 0; // too little + } + return error; +} + +/********************************************************************** + * LLSQ::pearson + * + * Return the pearson product moment correlation coefficient. + **********************************************************************/ + +double LLSQ::pearson() const { // get correlation + double r = 0.0; // Correlation is 0 if insufficient data. + + double covar = covariance(); + if (covar != 0.0) { + double var_product = x_variance() * y_variance(); + if (var_product > 0.0) { + r = covar / std::sqrt(var_product); + } + } + return r; +} + +// Returns the x,y means as an FCOORD. +FCOORD LLSQ::mean_point() const { + if (total_weight > 0.0) { + return FCOORD(sigx / total_weight, sigy / total_weight); + } else { + return FCOORD(0.0f, 0.0f); + } +} + +// Returns the sqrt of the mean squared error measured perpendicular from the +// line through mean_point() in the direction dir. +// +// Derivation: +// Lemma: Let v and x_i (i=1..N) be a k-dimensional vectors (1xk matrices). +// Let % be dot product and ' be transpose. Note that: +// Sum[i=1..N] (v % x_i)^2 +// = v * [x_1' x_2' ... x_N'] * [x_1' x_2' .. x_N']' * v' +// If x_i have average 0 we have: +// = v * (N * COVARIANCE_MATRIX(X)) * v' +// Expanded for the case that k = 2, where we treat the dimensions +// as x_i and y_i, this is: +// = v * (N * [VAR(X), COV(X,Y); COV(X,Y) VAR(Y)]) * v' +// Now, we are trying to calculate the mean squared error, where v is +// perpendicular to our line of interest: +// Mean squared error +// = E [ (v % (x_i - x_avg))) ^2 ] +// = Sum (v % (x_i - x_avg))^2 / N +// = v * N * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] / N * v' +// = v * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] * v' +// = code below +double LLSQ::rms_orth(const FCOORD &dir) const { + FCOORD v = !dir; + v.normalise(); + return std::sqrt(x_variance() * v.x() * v.x() + 2 * covariance() * v.x() * v.y() + + y_variance() * v.y() * v.y()); +} + +// Returns the direction of the fitted line as a unit vector, using the +// least mean squared perpendicular distance. The line runs through the +// mean_point, i.e. a point p on the line is given by: +// p = mean_point() + lambda * vector_fit() for some real number lambda. +// Note that the result (0<=x<=1, -1<=y<=-1) is directionally ambiguous +// and may be negated without changing its meaning. +// Fitting a line m + ๐v to a set of N points Pi = (xi, yi), where +// m is the mean point (๐, ๐) and +// v is the direction vector (cos๐, sin๐) +// The perpendicular distance of each Pi from the line is: +// (Pi - m) x v, where x is the scalar cross product. +// Total squared error is thus: +// E = โ((xi - ๐)sin๐ - (yi - ๐)cos๐)ยฒ +// = โ(xi - ๐)ยฒsinยฒ๐ - 2โ(xi - ๐)(yi - ๐)sin๐ cos๐ + โ(yi - ๐)ยฒcosยฒ๐ +// = NVar(xi)sinยฒ๐ - 2NCovar(xi, yi)sin๐ cos๐ + NVar(yi)cosยฒ๐ (Eq 1) +// where Var(xi) is the variance of xi, +// and Covar(xi, yi) is the covariance of xi, yi. +// Taking the derivative wrt ๐ and setting to 0 to obtain the min/max: +// 0 = 2NVar(xi)sin๐ cos๐ -2NCovar(xi, yi)(cosยฒ๐ - sinยฒ๐) -2NVar(yi)sin๐ cos๐ +// => Covar(xi, yi)(cosยฒ๐ - sinยฒ๐) = (Var(xi) - Var(yi))sin๐ cos๐ +// Using double angles: +// 2Covar(xi, yi)cos2๐ = (Var(xi) - Var(yi))sin2๐ (Eq 2) +// So ๐ = 0.5 atan2(2Covar(xi, yi), Var(xi) - Var(yi)) (Eq 3) + +// Because it involves 2๐ , Eq 2 has 2 solutions 90 degrees apart, but which +// is the min and which is the max? From Eq1: +// E/N = Var(xi)sinยฒ๐ - 2Covar(xi, yi)sin๐ cos๐ + Var(yi)cosยฒ๐ +// and 90 degrees away, using sin/cos equivalences: +// E'/N = Var(xi)cosยฒ๐ + 2Covar(xi, yi)sin๐ cos๐ + Var(yi)sinยฒ๐ +// The second error is smaller (making it the minimum) iff +// E'/N < E/N ie: +// (Var(xi) - Var(yi))(cosยฒ๐ - sinยฒ๐) < -4Covar(xi, yi)sin๐ cos๐ +// Using double angles: +// (Var(xi) - Var(yi))cos2๐ < -2Covar(xi, yi)sin2๐ (InEq 1) +// But atan2(2Covar(xi, yi), Var(xi) - Var(yi)) picks 2๐ such that: +// sgn(cos2๐) = sgn(Var(xi) - Var(yi)) and sgn(sin2๐) = sgn(Covar(xi, yi)) +// so InEq1 can *never* be true, making the atan2 result *always* the min! +// In the degenerate case, where Covar(xi, yi) = 0 AND Var(xi) = Var(yi), +// the 2 solutions have equal error and the inequality is still false. +// Therefore the solution really is as trivial as Eq 3. + +// This is equivalent to returning the Principal Component in PCA, or the +// eigenvector corresponding to the largest eigenvalue in the covariance +// matrix. However, atan2 is much simpler! The one reference I found that +// uses this formula is http://web.mit.edu/18.06/www/Essays/tlsfit.pdf but +// that is still a much more complex derivation. It seems Pearson had already +// found this simple solution in 1901. +// http://books.google.com/books?id=WXwvAQAAIAAJ&pg=PA559 +FCOORD LLSQ::vector_fit() const { + double x_var = x_variance(); + double y_var = y_variance(); + double covar = covariance(); + double theta = 0.5 * atan2(2.0 * covar, x_var - y_var); + FCOORD result(cos(theta), sin(theta)); + return result; +} + +} // namespace tesseract
