Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/detlinefit.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: detlinefit.h | |
| 3 // Description: Deterministic least upper-quartile squares line fitting. | |
| 4 // Author: Ray Smith | |
| 5 // | |
| 6 // (C) Copyright 2008, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_CCSTRUCT_DETLINEFIT_H_ | |
| 20 #define TESSERACT_CCSTRUCT_DETLINEFIT_H_ | |
| 21 | |
| 22 #include "kdpair.h" | |
| 23 #include "points.h" | |
| 24 | |
| 25 namespace tesseract { | |
| 26 | |
| 27 // This class fits a line to a set of ICOORD points. | |
| 28 // There is no restriction on the direction of the line, as it | |
| 29 // uses a vector method, ie no concern over infinite gradients. | |
| 30 // The fitted line has the least upper quartile of squares of perpendicular | |
| 31 // distances of all source points from the line, subject to the constraint | |
| 32 // that the line is made from one of the pairs of [{p1,p2,p3},{pn-2, pn-1, pn}] | |
| 33 // i.e. the 9 combinations of one of the first 3 and last 3 points. | |
| 34 // A fundamental assumption of this algorithm is that one of the first 3 and | |
| 35 // one of the last 3 points are near the best line fit. | |
| 36 // The points must be Added in line order for the algorithm to work properly. | |
| 37 // No floating point calculations are needed* to make an accurate fit, | |
| 38 // and no random numbers are needed** so the algorithm is deterministic, | |
| 39 // architecture-stable, and compiler-stable as well as stable to minor | |
| 40 // changes in the input. | |
| 41 // *A single floating point division is used to compute each line's distance. | |
| 42 // This is unlikely to result in choice of a different line, but if it does, | |
| 43 // it would be easy to replace with a 64 bit integer calculation. | |
| 44 // **Random numbers are used in the nth_item function, but the worst | |
| 45 // non-determinism that can result is picking a different result among equals, | |
| 46 // and that wouldn't make any difference to the end-result distance, so the | |
| 47 // randomness does not affect the determinism of the algorithm. The random | |
| 48 // numbers are only there to guarantee average linear time. | |
| 49 // Fitting time is linear, but with a high constant, as it tries 9 different | |
| 50 // lines and computes the distance of all points each time. | |
| 51 // This class is aimed at replacing the LLSQ (linear least squares) and | |
| 52 // LMS (least median of squares) classes that are currently used for most | |
| 53 // of the line fitting in Tesseract. | |
| 54 class DetLineFit { | |
| 55 public: | |
| 56 DetLineFit(); | |
| 57 ~DetLineFit() = default; | |
| 58 | |
| 59 // Delete all Added points. | |
| 60 void Clear(); | |
| 61 | |
| 62 // Adds a new point. Takes a copy - the pt doesn't need to stay in scope. | |
| 63 // Add must be called on points in sequence along the line. | |
| 64 void Add(const ICOORD &pt); | |
| 65 // Associates a half-width with the given point if a point overlaps the | |
| 66 // previous point by more than half the width, and its distance is further | |
| 67 // than the previous point, then the more distant point is ignored in the | |
| 68 // distance calculation. Useful for ignoring i dots and other diacritics. | |
| 69 void Add(const ICOORD &pt, int halfwidth); | |
| 70 | |
| 71 // Fits a line to the points, returning the fitted line as a pair of | |
| 72 // points, and the upper quartile error. | |
| 73 double Fit(ICOORD *pt1, ICOORD *pt2) { | |
| 74 return Fit(0, 0, pt1, pt2); | |
| 75 } | |
| 76 // Fits a line to the points, ignoring the skip_first initial points and the | |
| 77 // skip_last final points, returning the fitted line as a pair of points, | |
| 78 // and the upper quartile error. | |
| 79 double Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2); | |
| 80 | |
| 81 // Constrained fit with a supplied direction vector. Finds the best line_pt, | |
| 82 // that is one of the supplied points having the median cross product with | |
| 83 // direction, ignoring points that have a cross product outside of the range | |
| 84 // [min_dist, max_dist]. Returns the resulting error metric using the same | |
| 85 // reduced set of points. | |
| 86 // *Makes use of floating point arithmetic* | |
| 87 double ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, bool debug, | |
| 88 ICOORD *line_pt); | |
| 89 | |
| 90 // Returns true if there were enough points at the last call to Fit or | |
| 91 // ConstrainedFit for the fitted points to be used on a badly fitted line. | |
| 92 bool SufficientPointsForIndependentFit() const; | |
| 93 | |
| 94 // Backwards compatible fit returning a gradient and constant. | |
| 95 // Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this | |
| 96 // function in preference to the LMS class. | |
| 97 double Fit(float *m, float *c); | |
| 98 | |
| 99 // Backwards compatible constrained fit with a supplied gradient. | |
| 100 // Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible | |
| 101 // to avoid potential difficulties with infinite gradients. | |
| 102 double ConstrainedFit(double m, float *c); | |
| 103 | |
| 104 private: | |
| 105 // Simple struct to hold an ICOORD point and a halfwidth representing half | |
| 106 // the "width" (supposedly approximately parallel to the direction of the | |
| 107 // line) of each point, such that distant points can be discarded when they | |
| 108 // overlap nearer points. (Think i dot and other diacritics or noise.) | |
| 109 struct PointWidth { | |
| 110 PointWidth() : pt(ICOORD(0, 0)), halfwidth(0) {} | |
| 111 PointWidth(const ICOORD &pt0, int halfwidth0) : pt(pt0), halfwidth(halfwidth0) {} | |
| 112 | |
| 113 ICOORD pt; | |
| 114 int halfwidth; | |
| 115 }; | |
| 116 // Type holds the distance of each point from the fitted line and the point | |
| 117 // itself. Use of double allows integer distances from ICOORDs to be stored | |
| 118 // exactly, and also the floating point results from ConstrainedFit. | |
| 119 using DistPointPair = KDPairInc<double, ICOORD>; | |
| 120 | |
| 121 // Computes and returns the squared evaluation metric for a line fit. | |
| 122 double EvaluateLineFit(); | |
| 123 | |
| 124 // Computes the absolute values of the precomputed distances_, | |
| 125 // and returns the squared upper-quartile error distance. | |
| 126 double ComputeUpperQuartileError(); | |
| 127 | |
| 128 // Returns the number of sample points that have an error more than threshold. | |
| 129 int NumberOfMisfittedPoints(double threshold) const; | |
| 130 | |
| 131 // Computes all the cross product distances of the points from the line, | |
| 132 // storing the actual (signed) cross products in distances_. | |
| 133 // Ignores distances of points that are further away than the previous point, | |
| 134 // and overlaps the previous point by at least half. | |
| 135 void ComputeDistances(const ICOORD &start, const ICOORD &end); | |
| 136 | |
| 137 // Computes all the cross product distances of the points perpendicular to | |
| 138 // the given direction, ignoring distances outside of the give distance range, | |
| 139 // storing the actual (signed) cross products in distances_. | |
| 140 void ComputeConstrainedDistances(const FCOORD &direction, double min_dist, double max_dist); | |
| 141 | |
| 142 // Stores all the source points in the order they were given and their | |
| 143 // halfwidths, if any. | |
| 144 std::vector<PointWidth> pts_; | |
| 145 // Stores the computed perpendicular distances of (some of) the pts_ from a | |
| 146 // given vector (assuming it goes through the origin, making it a line). | |
| 147 // Since the distances may be a subset of the input points, and get | |
| 148 // re-ordered by the nth_item function, the original point is stored | |
| 149 // along side the distance. | |
| 150 std::vector<DistPointPair> distances_; // Distances of points. | |
| 151 // The squared length of the vector used to compute distances_. | |
| 152 double square_length_; | |
| 153 }; | |
| 154 | |
| 155 } // namespace tesseract. | |
| 156 | |
| 157 #endif // TESSERACT_CCSTRUCT_DETLINEFIT_H_ |
