Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/tabvector.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: tabvector.h | |
| 3 // Description: Class to hold a near-vertical vector representing a tab-stop. | |
| 4 // Author: Ray Smith | |
| 5 // | |
| 6 // (C) Copyright 2008, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_TEXTORD_TABVECTOR_H_ | |
| 20 #define TESSERACT_TEXTORD_TABVECTOR_H_ | |
| 21 | |
| 22 #include "bbgrid.h" | |
| 23 #include "blobgrid.h" | |
| 24 #include "clst.h" | |
| 25 #include "elst.h" | |
| 26 #include "elst2.h" | |
| 27 #include "rect.h" | |
| 28 | |
| 29 #include <algorithm> | |
| 30 | |
| 31 class BLOBNBOX; | |
| 32 class ScrollView; | |
| 33 | |
| 34 namespace tesseract { | |
| 35 | |
| 36 extern double_VAR_H(textord_tabvector_vertical_gap_fraction); | |
| 37 extern double_VAR_H(textord_tabvector_vertical_box_ratio); | |
| 38 | |
| 39 // The alignment type that a tab vector represents. | |
| 40 // Keep this enum synced with kAlignmentNames in tabvector.cpp. | |
| 41 enum TabAlignment { | |
| 42 TA_LEFT_ALIGNED, | |
| 43 TA_LEFT_RAGGED, | |
| 44 TA_CENTER_JUSTIFIED, | |
| 45 TA_RIGHT_ALIGNED, | |
| 46 TA_RIGHT_RAGGED, | |
| 47 TA_SEPARATOR, | |
| 48 TA_COUNT | |
| 49 }; | |
| 50 | |
| 51 // Forward declarations. The classes use their own list types, so we | |
| 52 // need to make the list types first. | |
| 53 class TabFind; | |
| 54 class TabVector; | |
| 55 class TabConstraint; | |
| 56 | |
| 57 ELIST2IZEH(TabVector) | |
| 58 CLISTIZEH(TabVector) | |
| 59 ELISTIZEH(TabConstraint) | |
| 60 | |
| 61 // TabConstraint is a totally self-contained class to maintain | |
| 62 // a list of [min,max] constraints, each referring to a TabVector. | |
| 63 // The constraints are manipulated through static methods that act | |
| 64 // on a list of constraints. The list itself is cooperatively owned | |
| 65 // by the TabVectors of the constraints on the list and managed | |
| 66 // by implicit reference counting via the elements of the list. | |
| 67 class TabConstraint : public ELIST_LINK { | |
| 68 public: | |
| 69 // This empty constructor is here only so that the class can be ELISTIZED. | |
| 70 // TODO(rays) change deep_copy in elst.h line 955 to take a callback copier | |
| 71 // and eliminate CLASSNAME##_copier. | |
| 72 TabConstraint() = default; | |
| 73 | |
| 74 // Create a constraint for the top or bottom of this TabVector. | |
| 75 static void CreateConstraint(TabVector *vector, bool is_top); | |
| 76 | |
| 77 // Test to see if the constraints are compatible enough to merge. | |
| 78 static bool CompatibleConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2); | |
| 79 | |
| 80 // Merge the lists of constraints and update the TabVector pointers. | |
| 81 // The second list is deleted. | |
| 82 static void MergeConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2); | |
| 83 | |
| 84 // Set all the tops and bottoms as appropriate to a mean of the | |
| 85 // constrained range. Delete all the constraints and list. | |
| 86 static void ApplyConstraints(TabConstraint_LIST *constraints); | |
| 87 | |
| 88 private: | |
| 89 TabConstraint(TabVector *vector, bool is_top); | |
| 90 | |
| 91 // Get the max of the mins and the min of the maxes. | |
| 92 static void GetConstraints(TabConstraint_LIST *constraints, int *y_min, int *y_max); | |
| 93 | |
| 94 // The TabVector this constraint applies to. | |
| 95 TabVector *vector_; | |
| 96 // If true then we refer to the top of the vector_. | |
| 97 bool is_top_; | |
| 98 // The allowed range of this vector_. | |
| 99 int y_min_; | |
| 100 int y_max_; | |
| 101 }; | |
| 102 | |
| 103 // Class to hold information about a single vector | |
| 104 // that represents a tab stop or a rule line. | |
| 105 class TabVector : public ELIST2_LINK { | |
| 106 public: | |
| 107 // TODO(rays) fix this in elst.h line 1076, where it should use the | |
| 108 // copy constructor instead of operator=. | |
| 109 TabVector() = default; | |
| 110 ~TabVector() = default; | |
| 111 | |
| 112 // Public factory to build a TabVector from a list of boxes. | |
| 113 // The TabVector will be of the given alignment type. | |
| 114 // The input vertical vector is used in fitting, and the output | |
| 115 // vertical_x, vertical_y have the resulting line vector added to them | |
| 116 // if the alignment is not ragged. | |
| 117 // The extended_start_y and extended_end_y are the maximum possible | |
| 118 // extension to the line segment that can be used to align with others. | |
| 119 // The input CLIST of BLOBNBOX good_points is consumed and taken over. | |
| 120 static TabVector *FitVector(TabAlignment alignment, ICOORD vertical, int extended_start_y, | |
| 121 int extended_end_y, BLOBNBOX_CLIST *good_points, int *vertical_x, | |
| 122 int *vertical_y); | |
| 123 | |
| 124 // Build a ragged TabVector by copying another's direction, shifting it | |
| 125 // to match the given blob, and making its initial extent the height | |
| 126 // of the blob, but its extended bounds from the bounds of the original. | |
| 127 TabVector(const TabVector &src, TabAlignment alignment, const ICOORD &vertical_skew, | |
| 128 BLOBNBOX *blob); | |
| 129 | |
| 130 // Copies basic attributes of a tab vector for simple operations. | |
| 131 // Copies things such startpt, endpt, range, width. | |
| 132 // Does not copy things such as partners, boxes, or constraints. | |
| 133 // This is useful if you only need vector information for processing, such | |
| 134 // as in the table detection code. | |
| 135 TabVector *ShallowCopy() const; | |
| 136 | |
| 137 // Simple accessors. | |
| 138 const ICOORD &startpt() const { | |
| 139 return startpt_; | |
| 140 } | |
| 141 const ICOORD &endpt() const { | |
| 142 return endpt_; | |
| 143 } | |
| 144 int extended_ymax() const { | |
| 145 return extended_ymax_; | |
| 146 } | |
| 147 int extended_ymin() const { | |
| 148 return extended_ymin_; | |
| 149 } | |
| 150 int sort_key() const { | |
| 151 return sort_key_; | |
| 152 } | |
| 153 int mean_width() const { | |
| 154 return mean_width_; | |
| 155 } | |
| 156 void set_top_constraints(TabConstraint_LIST *constraints) { | |
| 157 top_constraints_ = constraints; | |
| 158 } | |
| 159 void set_bottom_constraints(TabConstraint_LIST *constraints) { | |
| 160 bottom_constraints_ = constraints; | |
| 161 } | |
| 162 TabVector_CLIST *partners() { | |
| 163 return &partners_; | |
| 164 } | |
| 165 void set_startpt(const ICOORD &start) { | |
| 166 startpt_ = start; | |
| 167 } | |
| 168 void set_endpt(const ICOORD &end) { | |
| 169 endpt_ = end; | |
| 170 } | |
| 171 bool intersects_other_lines() const { | |
| 172 return intersects_other_lines_; | |
| 173 } | |
| 174 void set_intersects_other_lines(bool value) { | |
| 175 intersects_other_lines_ = value; | |
| 176 } | |
| 177 | |
| 178 // Inline quasi-accessors that require some computation. | |
| 179 | |
| 180 // Compute the x coordinate at the given y coordinate. | |
| 181 int XAtY(int y) const { | |
| 182 int height = endpt_.y() - startpt_.y(); | |
| 183 if (height != 0) { | |
| 184 return (y - startpt_.y()) * (endpt_.x() - startpt_.x()) / height + startpt_.x(); | |
| 185 } else { | |
| 186 return startpt_.x(); | |
| 187 } | |
| 188 } | |
| 189 | |
| 190 // Compute the vertical overlap with the other TabVector. | |
| 191 int VOverlap(const TabVector &other) const { | |
| 192 return std::min(other.endpt_.y(), endpt_.y()) - std::max(other.startpt_.y(), startpt_.y()); | |
| 193 } | |
| 194 // Compute the vertical overlap with the given y bounds. | |
| 195 int VOverlap(int top_y, int bottom_y) const { | |
| 196 return std::min(top_y, static_cast<int>(endpt_.y())) - | |
| 197 std::max(bottom_y, static_cast<int>(startpt_.y())); | |
| 198 } | |
| 199 // Compute the extended vertical overlap with the given y bounds. | |
| 200 int ExtendedOverlap(int top_y, int bottom_y) const { | |
| 201 return std::min(top_y, extended_ymax_) - std::max(bottom_y, extended_ymin_); | |
| 202 } | |
| 203 | |
| 204 // Return true if this is a left tab stop, either aligned, or ragged. | |
| 205 bool IsLeftTab() const { | |
| 206 return alignment_ == TA_LEFT_ALIGNED || alignment_ == TA_LEFT_RAGGED; | |
| 207 } | |
| 208 // Return true if this is a right tab stop, either aligned, or ragged. | |
| 209 bool IsRightTab() const { | |
| 210 return alignment_ == TA_RIGHT_ALIGNED || alignment_ == TA_RIGHT_RAGGED; | |
| 211 } | |
| 212 // Return true if this is a separator. | |
| 213 bool IsSeparator() const { | |
| 214 return alignment_ == TA_SEPARATOR; | |
| 215 } | |
| 216 // Return true if this is a center aligned tab stop. | |
| 217 bool IsCenterTab() const { | |
| 218 return alignment_ == TA_CENTER_JUSTIFIED; | |
| 219 } | |
| 220 // Return true if this is a ragged tab top, either left or right. | |
| 221 bool IsRagged() const { | |
| 222 return alignment_ == TA_LEFT_RAGGED || alignment_ == TA_RIGHT_RAGGED; | |
| 223 } | |
| 224 | |
| 225 // Return true if this vector is to the left of the other in terms | |
| 226 // of sort_key_. | |
| 227 bool IsLeftOf(const TabVector &other) const { | |
| 228 return sort_key_ < other.sort_key_; | |
| 229 } | |
| 230 | |
| 231 // Return true if the vector has no partners. | |
| 232 bool Partnerless() { | |
| 233 return partners_.empty(); | |
| 234 } | |
| 235 | |
| 236 // Return the number of tab boxes in this vector. | |
| 237 int BoxCount() { | |
| 238 return boxes_.length(); | |
| 239 } | |
| 240 | |
| 241 // Lock the vector from refits by clearing the boxes_ list. | |
| 242 void Freeze() { | |
| 243 boxes_.shallow_clear(); | |
| 244 } | |
| 245 | |
| 246 // Flip x and y on the ends so a vector can be created from flipped input. | |
| 247 void XYFlip() { | |
| 248 int x = startpt_.y(); | |
| 249 startpt_.set_y(startpt_.x()); | |
| 250 startpt_.set_x(x); | |
| 251 x = endpt_.y(); | |
| 252 endpt_.set_y(endpt_.x()); | |
| 253 endpt_.set_x(x); | |
| 254 } | |
| 255 | |
| 256 // Reflect the tab vector in the y-axis. | |
| 257 void ReflectInYAxis() { | |
| 258 startpt_.set_x(-startpt_.x()); | |
| 259 endpt_.set_x(-endpt_.x()); | |
| 260 sort_key_ = -sort_key_; | |
| 261 if (alignment_ == TA_LEFT_ALIGNED) { | |
| 262 alignment_ = TA_RIGHT_ALIGNED; | |
| 263 } else if (alignment_ == TA_RIGHT_ALIGNED) { | |
| 264 alignment_ = TA_LEFT_ALIGNED; | |
| 265 } | |
| 266 if (alignment_ == TA_LEFT_RAGGED) { | |
| 267 alignment_ = TA_RIGHT_RAGGED; | |
| 268 } else if (alignment_ == TA_RIGHT_RAGGED) { | |
| 269 alignment_ = TA_LEFT_RAGGED; | |
| 270 } | |
| 271 } | |
| 272 | |
| 273 // Separate function to compute the sort key for a given coordinate pair. | |
| 274 static int SortKey(const ICOORD &vertical, int x, int y) { | |
| 275 ICOORD pt(x, y); | |
| 276 return pt * vertical; | |
| 277 } | |
| 278 | |
| 279 // Return the x at the given y for the given sort key. | |
| 280 static int XAtY(const ICOORD &vertical, int sort_key, int y) { | |
| 281 if (vertical.y() != 0) { | |
| 282 return (vertical.x() * y + sort_key) / vertical.y(); | |
| 283 } else { | |
| 284 return sort_key; | |
| 285 } | |
| 286 } | |
| 287 | |
| 288 // Sort function for E2LIST::sort to sort by sort_key_. | |
| 289 static int SortVectorsByKey(const void *v1, const void *v2) { | |
| 290 const TabVector *tv1 = *static_cast<const TabVector *const *>(v1); | |
| 291 const TabVector *tv2 = *static_cast<const TabVector *const *>(v2); | |
| 292 return tv1->sort_key_ - tv2->sort_key_; | |
| 293 } | |
| 294 | |
| 295 // More complex members. | |
| 296 | |
| 297 // Extend this vector to include the supplied blob if it doesn't | |
| 298 // already have it. | |
| 299 void ExtendToBox(BLOBNBOX *blob); | |
| 300 | |
| 301 // Set the ycoord of the start and move the xcoord to match. | |
| 302 void SetYStart(int start_y); | |
| 303 // Set the ycoord of the end and move the xcoord to match. | |
| 304 void SetYEnd(int end_y); | |
| 305 | |
| 306 // Rotate the ends by the given vector. | |
| 307 void Rotate(const FCOORD &rotation); | |
| 308 | |
| 309 // Setup the initial constraints, being the limits of | |
| 310 // the vector and the extended ends. | |
| 311 void SetupConstraints(); | |
| 312 | |
| 313 // Setup the constraints between the partners of this TabVector. | |
| 314 void SetupPartnerConstraints(); | |
| 315 | |
| 316 // Setup the constraints between this and its partner. | |
| 317 void SetupPartnerConstraints(TabVector *partner); | |
| 318 | |
| 319 // Use the constraints to modify the top and bottom. | |
| 320 void ApplyConstraints(); | |
| 321 | |
| 322 // Merge close tab vectors of the same side that overlap. | |
| 323 static void MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors, | |
| 324 BlobGrid *grid); | |
| 325 | |
| 326 // Return true if this vector is the same side, overlaps, and close | |
| 327 // enough to the other to be merged. | |
| 328 bool SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const; | |
| 329 | |
| 330 // Eat the other TabVector into this and delete it. | |
| 331 void MergeWith(const ICOORD &vertical, TabVector *other); | |
| 332 | |
| 333 // Add a new element to the list of partner TabVectors. | |
| 334 // Partners must be added in order of increasing y coordinate of the text line | |
| 335 // that makes them partners. | |
| 336 // Groups of identical partners are merged into one. | |
| 337 void AddPartner(TabVector *partner); | |
| 338 | |
| 339 // Return true if other is a partner of this. | |
| 340 bool IsAPartner(const TabVector *other); | |
| 341 | |
| 342 // Print basic information about this tab vector. | |
| 343 void Print(const char *prefix); | |
| 344 | |
| 345 // Print basic information about this tab vector and every box in it. | |
| 346 void Debug(const char *prefix); | |
| 347 | |
| 348 // Draw this tabvector in place in the given window. | |
| 349 void Display(ScrollView *tab_win); | |
| 350 | |
| 351 // Refit the line and/or re-evaluate the vector if the dirty flags are set. | |
| 352 void FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder); | |
| 353 | |
| 354 // Evaluate the vector in terms of coverage of its length by good-looking | |
| 355 // box edges. A good looking box is one where its nearest neighbour on the | |
| 356 // inside is nearer than half the distance its nearest neighbour on the | |
| 357 // outside of the putative column. Bad boxes are removed from the line. | |
| 358 // A second pass then further filters boxes by requiring that the gutter | |
| 359 // width be a minimum fraction of the mean gutter along the line. | |
| 360 void Evaluate(const ICOORD &vertical, TabFind *finder); | |
| 361 | |
| 362 // (Re)Fit a line to the stored points. Returns false if the line | |
| 363 // is degenerate. Although the TabVector code mostly doesn't care about the | |
| 364 // direction of lines, XAtY would give silly results for a horizontal line. | |
| 365 // The class is mostly aimed at use for vertical lines representing | |
| 366 // horizontal tab stops. | |
| 367 bool Fit(ICOORD vertical, bool force_parallel); | |
| 368 | |
| 369 // Return the partner of this TabVector if the vector qualifies as | |
| 370 // being a vertical text line, otherwise nullptr. | |
| 371 TabVector *VerticalTextlinePartner(); | |
| 372 | |
| 373 // Return the matching tabvector if there is exactly one partner, or | |
| 374 // nullptr otherwise. This can be used after matching is done, eg. by | |
| 375 // VerticalTextlinePartner(), without checking if the line is vertical. | |
| 376 TabVector *GetSinglePartner(); | |
| 377 | |
| 378 private: | |
| 379 // Constructor is private as the static factory is the external way | |
| 380 // to build a TabVector. | |
| 381 TabVector(int extended_ymin, int extended_ymax, TabAlignment alignment, BLOBNBOX_CLIST *boxes); | |
| 382 | |
| 383 // Delete this, but first, repoint all the partners to point to | |
| 384 // replacement. If replacement is nullptr, then partner relationships | |
| 385 // are removed. | |
| 386 void Delete(TabVector *replacement); | |
| 387 | |
| 388 private: | |
| 389 // The bottom of the tab line. | |
| 390 ICOORD startpt_; | |
| 391 // The top of the tab line. | |
| 392 ICOORD endpt_; | |
| 393 // The lowest y that the vector might extend to. | |
| 394 int extended_ymin_ = 0; | |
| 395 // The highest y that the vector might extend to. | |
| 396 int extended_ymax_ = 0; | |
| 397 // Perpendicular distance of vector from a given vertical for sorting. | |
| 398 int sort_key_ = 0; | |
| 399 // Result of Evaluate 0-100. Coverage of line with good boxes. | |
| 400 int percent_score_ = 0; | |
| 401 // The mean width of the blobs. Meaningful only for separator lines. | |
| 402 int mean_width_ = 0; | |
| 403 // True if the boxes_ list has been modified, so a refit is needed. | |
| 404 bool needs_refit_ = false; | |
| 405 // True if a fit has been done, so re-evaluation is needed. | |
| 406 bool needs_evaluation_ = false; | |
| 407 // True if a separator line intersects at least 2 other lines. | |
| 408 bool intersects_other_lines_ = false; | |
| 409 // The type of this TabVector. | |
| 410 TabAlignment alignment_ = TA_LEFT_ALIGNED; | |
| 411 // The list of boxes whose edges are aligned at this TabVector. | |
| 412 BLOBNBOX_CLIST boxes_; | |
| 413 // List of TabVectors that have a connection with this via a text line. | |
| 414 TabVector_CLIST partners_; | |
| 415 // Constraints used to resolve the exact location of the top and bottom | |
| 416 // of the tab line. | |
| 417 TabConstraint_LIST *top_constraints_ = nullptr; | |
| 418 TabConstraint_LIST *bottom_constraints_ = nullptr; | |
| 419 }; | |
| 420 | |
| 421 } // namespace tesseract. | |
| 422 | |
| 423 #endif // TESSERACT_TEXTORD_TABVECTOR_H_ |
