Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/detlinefit.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: detlinefit.cpp | |
| 3 // Description: Deterministic least median squares line fitting. | |
| 4 // Author: Ray Smith | |
| 5 // | |
| 6 // (C) Copyright 2008, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #include "detlinefit.h" | |
| 20 #include "helpers.h" // for IntCastRounded | |
| 21 #include "statistc.h" | |
| 22 #include "tesserrstream.h" // for tesserr | |
| 23 | |
| 24 #include <algorithm> | |
| 25 #include <cfloat> // for FLT_MAX | |
| 26 | |
| 27 namespace tesseract { | |
| 28 | |
| 29 // The number of points to consider at each end. | |
| 30 const int kNumEndPoints = 3; | |
| 31 // The minimum number of points at which to switch to number of points | |
| 32 // for badly fitted lines. | |
| 33 // To ensure a sensible error metric, kMinPointsForErrorCount should be at | |
| 34 // least kMaxRealDistance / (1 - %ile) where %ile is the fractile used in | |
| 35 // ComputeUpperQuartileError. | |
| 36 const int kMinPointsForErrorCount = 16; | |
| 37 // The maximum real distance to use before switching to number of | |
| 38 // mis-fitted points, which will get square-rooted for true distance. | |
| 39 const int kMaxRealDistance = 2.0; | |
| 40 | |
| 41 DetLineFit::DetLineFit() : square_length_(0.0) {} | |
| 42 | |
| 43 // Delete all Added points. | |
| 44 void DetLineFit::Clear() { | |
| 45 pts_.clear(); | |
| 46 distances_.clear(); | |
| 47 } | |
| 48 | |
| 49 // Add a new point. Takes a copy - the pt doesn't need to stay in scope. | |
| 50 void DetLineFit::Add(const ICOORD &pt) { | |
| 51 pts_.emplace_back(pt, 0); | |
| 52 } | |
| 53 // Associates a half-width with the given point if a point overlaps the | |
| 54 // previous point by more than half the width, and its distance is further | |
| 55 // than the previous point, then the more distant point is ignored in the | |
| 56 // distance calculation. Useful for ignoring i dots and other diacritics. | |
| 57 void DetLineFit::Add(const ICOORD &pt, int halfwidth) { | |
| 58 pts_.emplace_back(pt, halfwidth); | |
| 59 } | |
| 60 | |
| 61 // Fits a line to the points, ignoring the skip_first initial points and the | |
| 62 // skip_last final points, returning the fitted line as a pair of points, | |
| 63 // and the upper quartile error. | |
| 64 double DetLineFit::Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2) { | |
| 65 // Do something sensible with no points. | |
| 66 if (pts_.empty()) { | |
| 67 pt1->set_x(0); | |
| 68 pt1->set_y(0); | |
| 69 *pt2 = *pt1; | |
| 70 return 0.0; | |
| 71 } | |
| 72 // Count the points and find the first and last kNumEndPoints. | |
| 73 int pt_count = pts_.size(); | |
| 74 ICOORD *starts[kNumEndPoints]; | |
| 75 if (skip_first >= pt_count) { | |
| 76 skip_first = pt_count - 1; | |
| 77 } | |
| 78 int start_count = 0; | |
| 79 int end_i = std::min(skip_first + kNumEndPoints, pt_count); | |
| 80 for (int i = skip_first; i < end_i; ++i) { | |
| 81 starts[start_count++] = &pts_[i].pt; | |
| 82 } | |
| 83 ICOORD *ends[kNumEndPoints]; | |
| 84 if (skip_last >= pt_count) { | |
| 85 skip_last = pt_count - 1; | |
| 86 } | |
| 87 int end_count = 0; | |
| 88 end_i = std::max(0, pt_count - kNumEndPoints - skip_last); | |
| 89 for (int i = pt_count - 1 - skip_last; i >= end_i; --i) { | |
| 90 ends[end_count++] = &pts_[i].pt; | |
| 91 } | |
| 92 // 1 or 2 points need special treatment. | |
| 93 if (pt_count <= 2) { | |
| 94 *pt1 = *starts[0]; | |
| 95 if (pt_count > 1) { | |
| 96 *pt2 = *ends[0]; | |
| 97 } else { | |
| 98 *pt2 = *pt1; | |
| 99 } | |
| 100 return 0.0; | |
| 101 } | |
| 102 // Although with between 2 and 2*kNumEndPoints-1 points, there will be | |
| 103 // overlap in the starts, ends sets, this is OK and taken care of by the | |
| 104 // if (*start != *end) test below, which also tests for equal input points. | |
| 105 double best_uq = -1.0; | |
| 106 // Iterate each pair of points and find the best fitting line. | |
| 107 for (int i = 0; i < start_count; ++i) { | |
| 108 ICOORD *start = starts[i]; | |
| 109 for (int j = 0; j < end_count; ++j) { | |
| 110 ICOORD *end = ends[j]; | |
| 111 if (*start != *end) { | |
| 112 ComputeDistances(*start, *end); | |
| 113 // Compute the upper quartile error from the line. | |
| 114 double dist = EvaluateLineFit(); | |
| 115 if (dist < best_uq || best_uq < 0.0) { | |
| 116 best_uq = dist; | |
| 117 *pt1 = *start; | |
| 118 *pt2 = *end; | |
| 119 } | |
| 120 } | |
| 121 } | |
| 122 } | |
| 123 // Finally compute the square root to return the true distance. | |
| 124 return best_uq > 0.0 ? sqrt(best_uq) : best_uq; | |
| 125 } | |
| 126 | |
| 127 // Constrained fit with a supplied direction vector. Finds the best line_pt, | |
| 128 // that is one of the supplied points having the median cross product with | |
| 129 // direction, ignoring points that have a cross product outside of the range | |
| 130 // [min_dist, max_dist]. Returns the resulting error metric using the same | |
| 131 // reduced set of points. | |
| 132 // *Makes use of floating point arithmetic* | |
| 133 double DetLineFit::ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, | |
| 134 bool debug, ICOORD *line_pt) { | |
| 135 ComputeConstrainedDistances(direction, min_dist, max_dist); | |
| 136 // Do something sensible with no points or computed distances. | |
| 137 if (pts_.empty() || distances_.empty()) { | |
| 138 line_pt->set_x(0); | |
| 139 line_pt->set_y(0); | |
| 140 return 0.0; | |
| 141 } | |
| 142 auto median_index = distances_.size() / 2; | |
| 143 std::nth_element(distances_.begin(), distances_.begin() + median_index, distances_.end()); | |
| 144 *line_pt = distances_[median_index].data(); | |
| 145 if (debug) { | |
| 146 tesserr << "Constrained fit to dir " << direction.x() << ", " | |
| 147 << direction.y() << " = " | |
| 148 << line_pt->x() << ", " << line_pt->y() | |
| 149 << " :" << distances_.size() << " distances:\n"; | |
| 150 for (unsigned i = 0; i < distances_.size(); ++i) { | |
| 151 tesserr << i << ": " | |
| 152 << distances_[i].data().x() << ", " | |
| 153 << distances_[i].data().y() << " -> " | |
| 154 << distances_[i].key() << '\n'; | |
| 155 } | |
| 156 tesserr << "Result = " << median_index << '\n'; | |
| 157 } | |
| 158 // Center distances on the fitted point. | |
| 159 double dist_origin = direction * *line_pt; | |
| 160 for (auto &distance : distances_) { | |
| 161 distance.key() -= dist_origin; | |
| 162 } | |
| 163 return sqrt(EvaluateLineFit()); | |
| 164 } | |
| 165 | |
| 166 // Returns true if there were enough points at the last call to Fit or | |
| 167 // ConstrainedFit for the fitted points to be used on a badly fitted line. | |
| 168 bool DetLineFit::SufficientPointsForIndependentFit() const { | |
| 169 return distances_.size() >= kMinPointsForErrorCount; | |
| 170 } | |
| 171 | |
| 172 // Backwards compatible fit returning a gradient and constant. | |
| 173 // Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this | |
| 174 // function in preference to the LMS class. | |
| 175 double DetLineFit::Fit(float *m, float *c) { | |
| 176 ICOORD start, end; | |
| 177 double error = Fit(&start, &end); | |
| 178 if (end.x() != start.x()) { | |
| 179 *m = static_cast<float>(end.y() - start.y()) / (end.x() - start.x()); | |
| 180 *c = start.y() - *m * start.x(); | |
| 181 } else { | |
| 182 *m = 0.0f; | |
| 183 *c = 0.0f; | |
| 184 } | |
| 185 return error; | |
| 186 } | |
| 187 | |
| 188 // Backwards compatible constrained fit with a supplied gradient. | |
| 189 // Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible | |
| 190 // to avoid potential difficulties with infinite gradients. | |
| 191 double DetLineFit::ConstrainedFit(double m, float *c) { | |
| 192 // Do something sensible with no points. | |
| 193 if (pts_.empty()) { | |
| 194 *c = 0.0f; | |
| 195 return 0.0; | |
| 196 } | |
| 197 double cos = 1.0 / sqrt(1.0 + m * m); | |
| 198 FCOORD direction(cos, m * cos); | |
| 199 ICOORD line_pt; | |
| 200 double error = ConstrainedFit(direction, -FLT_MAX, FLT_MAX, false, &line_pt); | |
| 201 *c = line_pt.y() - line_pt.x() * m; | |
| 202 return error; | |
| 203 } | |
| 204 | |
| 205 // Computes and returns the squared evaluation metric for a line fit. | |
| 206 double DetLineFit::EvaluateLineFit() { | |
| 207 // Compute the upper quartile error from the line. | |
| 208 double dist = ComputeUpperQuartileError(); | |
| 209 if (distances_.size() >= kMinPointsForErrorCount && dist > kMaxRealDistance * kMaxRealDistance) { | |
| 210 // Use the number of mis-fitted points as the error metric, as this | |
| 211 // gives a better measure of fit for badly fitted lines where more | |
| 212 // than a quarter are badly fitted. | |
| 213 double threshold = kMaxRealDistance * sqrt(square_length_); | |
| 214 dist = NumberOfMisfittedPoints(threshold); | |
| 215 } | |
| 216 return dist; | |
| 217 } | |
| 218 | |
| 219 // Computes the absolute error distances of the points from the line, | |
| 220 // and returns the squared upper-quartile error distance. | |
| 221 double DetLineFit::ComputeUpperQuartileError() { | |
| 222 int num_errors = distances_.size(); | |
| 223 if (num_errors == 0) { | |
| 224 return 0.0; | |
| 225 } | |
| 226 // Get the absolute values of the errors. | |
| 227 for (int i = 0; i < num_errors; ++i) { | |
| 228 if (distances_[i].key() < 0) { | |
| 229 distances_[i].key() = -distances_[i].key(); | |
| 230 } | |
| 231 } | |
| 232 // Now get the upper quartile distance. | |
| 233 auto index = 3 * num_errors / 4; | |
| 234 std::nth_element(distances_.begin(), distances_.begin() + index, distances_.end()); | |
| 235 double dist = distances_[index].key(); | |
| 236 // The true distance is the square root of the dist squared / square_length. | |
| 237 // Don't bother with the square root. Just return the square distance. | |
| 238 return square_length_ > 0.0 ? dist * dist / square_length_ : 0.0; | |
| 239 } | |
| 240 | |
| 241 // Returns the number of sample points that have an error more than threshold. | |
| 242 int DetLineFit::NumberOfMisfittedPoints(double threshold) const { | |
| 243 int num_misfits = 0; | |
| 244 int num_dists = distances_.size(); | |
| 245 // Get the absolute values of the errors. | |
| 246 for (int i = 0; i < num_dists; ++i) { | |
| 247 if (distances_[i].key() > threshold) { | |
| 248 ++num_misfits; | |
| 249 } | |
| 250 } | |
| 251 return num_misfits; | |
| 252 } | |
| 253 | |
| 254 // Computes all the cross product distances of the points from the line, | |
| 255 // storing the actual (signed) cross products in distances. | |
| 256 // Ignores distances of points that are further away than the previous point, | |
| 257 // and overlaps the previous point by at least half. | |
| 258 void DetLineFit::ComputeDistances(const ICOORD &start, const ICOORD &end) { | |
| 259 distances_.clear(); | |
| 260 ICOORD line_vector = end; | |
| 261 line_vector -= start; | |
| 262 square_length_ = line_vector.sqlength(); | |
| 263 int line_length = IntCastRounded(sqrt(square_length_)); | |
| 264 // Compute the distance of each point from the line. | |
| 265 int prev_abs_dist = 0; | |
| 266 int prev_dot = 0; | |
| 267 for (unsigned i = 0; i < pts_.size(); ++i) { | |
| 268 ICOORD pt_vector = pts_[i].pt; | |
| 269 pt_vector -= start; | |
| 270 int dot = line_vector % pt_vector; | |
| 271 // Compute |line_vector||pt_vector|sin(angle between) | |
| 272 int dist = line_vector * pt_vector; | |
| 273 int abs_dist = dist < 0 ? -dist : dist; | |
| 274 if (abs_dist > prev_abs_dist && i > 0) { | |
| 275 // Ignore this point if it overlaps the previous one. | |
| 276 int separation = abs(dot - prev_dot); | |
| 277 if (separation < line_length * pts_[i].halfwidth || | |
| 278 separation < line_length * pts_[i - 1].halfwidth) { | |
| 279 continue; | |
| 280 } | |
| 281 } | |
| 282 distances_.emplace_back(dist, pts_[i].pt); | |
| 283 prev_abs_dist = abs_dist; | |
| 284 prev_dot = dot; | |
| 285 } | |
| 286 } | |
| 287 | |
| 288 // Computes all the cross product distances of the points perpendicular to | |
| 289 // the given direction, ignoring distances outside of the give distance range, | |
| 290 // storing the actual (signed) cross products in distances_. | |
| 291 void DetLineFit::ComputeConstrainedDistances(const FCOORD &direction, double min_dist, | |
| 292 double max_dist) { | |
| 293 distances_.clear(); | |
| 294 square_length_ = direction.sqlength(); | |
| 295 // Compute the distance of each point from the line. | |
| 296 for (auto &pt : pts_) { | |
| 297 FCOORD pt_vector = pt.pt; | |
| 298 // Compute |line_vector||pt_vector|sin(angle between) | |
| 299 double dist = direction * pt_vector; | |
| 300 if (min_dist <= dist && dist <= max_dist) { | |
| 301 distances_.emplace_back(dist, pt.pt); | |
| 302 } | |
| 303 } | |
| 304 } | |
| 305 | |
| 306 } // namespace tesseract. |
