Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/tabvector.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/tabvector.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,423 @@ +/////////////////////////////////////////////////////////////////////// +// File: tabvector.h +// Description: Class to hold a near-vertical vector representing a tab-stop. +// Author: Ray Smith +// +// (C) Copyright 2008, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_TEXTORD_TABVECTOR_H_ +#define TESSERACT_TEXTORD_TABVECTOR_H_ + +#include "bbgrid.h" +#include "blobgrid.h" +#include "clst.h" +#include "elst.h" +#include "elst2.h" +#include "rect.h" + +#include <algorithm> + +class BLOBNBOX; +class ScrollView; + +namespace tesseract { + +extern double_VAR_H(textord_tabvector_vertical_gap_fraction); +extern double_VAR_H(textord_tabvector_vertical_box_ratio); + +// The alignment type that a tab vector represents. +// Keep this enum synced with kAlignmentNames in tabvector.cpp. +enum TabAlignment { + TA_LEFT_ALIGNED, + TA_LEFT_RAGGED, + TA_CENTER_JUSTIFIED, + TA_RIGHT_ALIGNED, + TA_RIGHT_RAGGED, + TA_SEPARATOR, + TA_COUNT +}; + +// Forward declarations. The classes use their own list types, so we +// need to make the list types first. +class TabFind; +class TabVector; +class TabConstraint; + +ELIST2IZEH(TabVector) +CLISTIZEH(TabVector) +ELISTIZEH(TabConstraint) + +// TabConstraint is a totally self-contained class to maintain +// a list of [min,max] constraints, each referring to a TabVector. +// The constraints are manipulated through static methods that act +// on a list of constraints. The list itself is cooperatively owned +// by the TabVectors of the constraints on the list and managed +// by implicit reference counting via the elements of the list. +class TabConstraint : public ELIST_LINK { +public: + // This empty constructor is here only so that the class can be ELISTIZED. + // TODO(rays) change deep_copy in elst.h line 955 to take a callback copier + // and eliminate CLASSNAME##_copier. + TabConstraint() = default; + + // Create a constraint for the top or bottom of this TabVector. + static void CreateConstraint(TabVector *vector, bool is_top); + + // Test to see if the constraints are compatible enough to merge. + static bool CompatibleConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2); + + // Merge the lists of constraints and update the TabVector pointers. + // The second list is deleted. + static void MergeConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2); + + // Set all the tops and bottoms as appropriate to a mean of the + // constrained range. Delete all the constraints and list. + static void ApplyConstraints(TabConstraint_LIST *constraints); + +private: + TabConstraint(TabVector *vector, bool is_top); + + // Get the max of the mins and the min of the maxes. + static void GetConstraints(TabConstraint_LIST *constraints, int *y_min, int *y_max); + + // The TabVector this constraint applies to. + TabVector *vector_; + // If true then we refer to the top of the vector_. + bool is_top_; + // The allowed range of this vector_. + int y_min_; + int y_max_; +}; + +// Class to hold information about a single vector +// that represents a tab stop or a rule line. +class TabVector : public ELIST2_LINK { +public: + // TODO(rays) fix this in elst.h line 1076, where it should use the + // copy constructor instead of operator=. + TabVector() = default; + ~TabVector() = default; + + // Public factory to build a TabVector from a list of boxes. + // The TabVector will be of the given alignment type. + // The input vertical vector is used in fitting, and the output + // vertical_x, vertical_y have the resulting line vector added to them + // if the alignment is not ragged. + // The extended_start_y and extended_end_y are the maximum possible + // extension to the line segment that can be used to align with others. + // The input CLIST of BLOBNBOX good_points is consumed and taken over. + static TabVector *FitVector(TabAlignment alignment, ICOORD vertical, int extended_start_y, + int extended_end_y, BLOBNBOX_CLIST *good_points, int *vertical_x, + int *vertical_y); + + // Build a ragged TabVector by copying another's direction, shifting it + // to match the given blob, and making its initial extent the height + // of the blob, but its extended bounds from the bounds of the original. + TabVector(const TabVector &src, TabAlignment alignment, const ICOORD &vertical_skew, + BLOBNBOX *blob); + + // Copies basic attributes of a tab vector for simple operations. + // Copies things such startpt, endpt, range, width. + // Does not copy things such as partners, boxes, or constraints. + // This is useful if you only need vector information for processing, such + // as in the table detection code. + TabVector *ShallowCopy() const; + + // Simple accessors. + const ICOORD &startpt() const { + return startpt_; + } + const ICOORD &endpt() const { + return endpt_; + } + int extended_ymax() const { + return extended_ymax_; + } + int extended_ymin() const { + return extended_ymin_; + } + int sort_key() const { + return sort_key_; + } + int mean_width() const { + return mean_width_; + } + void set_top_constraints(TabConstraint_LIST *constraints) { + top_constraints_ = constraints; + } + void set_bottom_constraints(TabConstraint_LIST *constraints) { + bottom_constraints_ = constraints; + } + TabVector_CLIST *partners() { + return &partners_; + } + void set_startpt(const ICOORD &start) { + startpt_ = start; + } + void set_endpt(const ICOORD &end) { + endpt_ = end; + } + bool intersects_other_lines() const { + return intersects_other_lines_; + } + void set_intersects_other_lines(bool value) { + intersects_other_lines_ = value; + } + + // Inline quasi-accessors that require some computation. + + // Compute the x coordinate at the given y coordinate. + int XAtY(int y) const { + int height = endpt_.y() - startpt_.y(); + if (height != 0) { + return (y - startpt_.y()) * (endpt_.x() - startpt_.x()) / height + startpt_.x(); + } else { + return startpt_.x(); + } + } + + // Compute the vertical overlap with the other TabVector. + int VOverlap(const TabVector &other) const { + return std::min(other.endpt_.y(), endpt_.y()) - std::max(other.startpt_.y(), startpt_.y()); + } + // Compute the vertical overlap with the given y bounds. + int VOverlap(int top_y, int bottom_y) const { + return std::min(top_y, static_cast<int>(endpt_.y())) - + std::max(bottom_y, static_cast<int>(startpt_.y())); + } + // Compute the extended vertical overlap with the given y bounds. + int ExtendedOverlap(int top_y, int bottom_y) const { + return std::min(top_y, extended_ymax_) - std::max(bottom_y, extended_ymin_); + } + + // Return true if this is a left tab stop, either aligned, or ragged. + bool IsLeftTab() const { + return alignment_ == TA_LEFT_ALIGNED || alignment_ == TA_LEFT_RAGGED; + } + // Return true if this is a right tab stop, either aligned, or ragged. + bool IsRightTab() const { + return alignment_ == TA_RIGHT_ALIGNED || alignment_ == TA_RIGHT_RAGGED; + } + // Return true if this is a separator. + bool IsSeparator() const { + return alignment_ == TA_SEPARATOR; + } + // Return true if this is a center aligned tab stop. + bool IsCenterTab() const { + return alignment_ == TA_CENTER_JUSTIFIED; + } + // Return true if this is a ragged tab top, either left or right. + bool IsRagged() const { + return alignment_ == TA_LEFT_RAGGED || alignment_ == TA_RIGHT_RAGGED; + } + + // Return true if this vector is to the left of the other in terms + // of sort_key_. + bool IsLeftOf(const TabVector &other) const { + return sort_key_ < other.sort_key_; + } + + // Return true if the vector has no partners. + bool Partnerless() { + return partners_.empty(); + } + + // Return the number of tab boxes in this vector. + int BoxCount() { + return boxes_.length(); + } + + // Lock the vector from refits by clearing the boxes_ list. + void Freeze() { + boxes_.shallow_clear(); + } + + // Flip x and y on the ends so a vector can be created from flipped input. + void XYFlip() { + int x = startpt_.y(); + startpt_.set_y(startpt_.x()); + startpt_.set_x(x); + x = endpt_.y(); + endpt_.set_y(endpt_.x()); + endpt_.set_x(x); + } + + // Reflect the tab vector in the y-axis. + void ReflectInYAxis() { + startpt_.set_x(-startpt_.x()); + endpt_.set_x(-endpt_.x()); + sort_key_ = -sort_key_; + if (alignment_ == TA_LEFT_ALIGNED) { + alignment_ = TA_RIGHT_ALIGNED; + } else if (alignment_ == TA_RIGHT_ALIGNED) { + alignment_ = TA_LEFT_ALIGNED; + } + if (alignment_ == TA_LEFT_RAGGED) { + alignment_ = TA_RIGHT_RAGGED; + } else if (alignment_ == TA_RIGHT_RAGGED) { + alignment_ = TA_LEFT_RAGGED; + } + } + + // Separate function to compute the sort key for a given coordinate pair. + static int SortKey(const ICOORD &vertical, int x, int y) { + ICOORD pt(x, y); + return pt * vertical; + } + + // Return the x at the given y for the given sort key. + static int XAtY(const ICOORD &vertical, int sort_key, int y) { + if (vertical.y() != 0) { + return (vertical.x() * y + sort_key) / vertical.y(); + } else { + return sort_key; + } + } + + // Sort function for E2LIST::sort to sort by sort_key_. + static int SortVectorsByKey(const void *v1, const void *v2) { + const TabVector *tv1 = *static_cast<const TabVector *const *>(v1); + const TabVector *tv2 = *static_cast<const TabVector *const *>(v2); + return tv1->sort_key_ - tv2->sort_key_; + } + + // More complex members. + + // Extend this vector to include the supplied blob if it doesn't + // already have it. + void ExtendToBox(BLOBNBOX *blob); + + // Set the ycoord of the start and move the xcoord to match. + void SetYStart(int start_y); + // Set the ycoord of the end and move the xcoord to match. + void SetYEnd(int end_y); + + // Rotate the ends by the given vector. + void Rotate(const FCOORD &rotation); + + // Setup the initial constraints, being the limits of + // the vector and the extended ends. + void SetupConstraints(); + + // Setup the constraints between the partners of this TabVector. + void SetupPartnerConstraints(); + + // Setup the constraints between this and its partner. + void SetupPartnerConstraints(TabVector *partner); + + // Use the constraints to modify the top and bottom. + void ApplyConstraints(); + + // Merge close tab vectors of the same side that overlap. + static void MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors, + BlobGrid *grid); + + // Return true if this vector is the same side, overlaps, and close + // enough to the other to be merged. + bool SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const; + + // Eat the other TabVector into this and delete it. + void MergeWith(const ICOORD &vertical, TabVector *other); + + // Add a new element to the list of partner TabVectors. + // Partners must be added in order of increasing y coordinate of the text line + // that makes them partners. + // Groups of identical partners are merged into one. + void AddPartner(TabVector *partner); + + // Return true if other is a partner of this. + bool IsAPartner(const TabVector *other); + + // Print basic information about this tab vector. + void Print(const char *prefix); + + // Print basic information about this tab vector and every box in it. + void Debug(const char *prefix); + + // Draw this tabvector in place in the given window. + void Display(ScrollView *tab_win); + + // Refit the line and/or re-evaluate the vector if the dirty flags are set. + void FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder); + + // Evaluate the vector in terms of coverage of its length by good-looking + // box edges. A good looking box is one where its nearest neighbour on the + // inside is nearer than half the distance its nearest neighbour on the + // outside of the putative column. Bad boxes are removed from the line. + // A second pass then further filters boxes by requiring that the gutter + // width be a minimum fraction of the mean gutter along the line. + void Evaluate(const ICOORD &vertical, TabFind *finder); + + // (Re)Fit a line to the stored points. Returns false if the line + // is degenerate. Although the TabVector code mostly doesn't care about the + // direction of lines, XAtY would give silly results for a horizontal line. + // The class is mostly aimed at use for vertical lines representing + // horizontal tab stops. + bool Fit(ICOORD vertical, bool force_parallel); + + // Return the partner of this TabVector if the vector qualifies as + // being a vertical text line, otherwise nullptr. + TabVector *VerticalTextlinePartner(); + + // Return the matching tabvector if there is exactly one partner, or + // nullptr otherwise. This can be used after matching is done, eg. by + // VerticalTextlinePartner(), without checking if the line is vertical. + TabVector *GetSinglePartner(); + +private: + // Constructor is private as the static factory is the external way + // to build a TabVector. + TabVector(int extended_ymin, int extended_ymax, TabAlignment alignment, BLOBNBOX_CLIST *boxes); + + // Delete this, but first, repoint all the partners to point to + // replacement. If replacement is nullptr, then partner relationships + // are removed. + void Delete(TabVector *replacement); + +private: + // The bottom of the tab line. + ICOORD startpt_; + // The top of the tab line. + ICOORD endpt_; + // The lowest y that the vector might extend to. + int extended_ymin_ = 0; + // The highest y that the vector might extend to. + int extended_ymax_ = 0; + // Perpendicular distance of vector from a given vertical for sorting. + int sort_key_ = 0; + // Result of Evaluate 0-100. Coverage of line with good boxes. + int percent_score_ = 0; + // The mean width of the blobs. Meaningful only for separator lines. + int mean_width_ = 0; + // True if the boxes_ list has been modified, so a refit is needed. + bool needs_refit_ = false; + // True if a fit has been done, so re-evaluation is needed. + bool needs_evaluation_ = false; + // True if a separator line intersects at least 2 other lines. + bool intersects_other_lines_ = false; + // The type of this TabVector. + TabAlignment alignment_ = TA_LEFT_ALIGNED; + // The list of boxes whose edges are aligned at this TabVector. + BLOBNBOX_CLIST boxes_; + // List of TabVectors that have a connection with this via a text line. + TabVector_CLIST partners_; + // Constraints used to resolve the exact location of the top and bottom + // of the tab line. + TabConstraint_LIST *top_constraints_ = nullptr; + TabConstraint_LIST *bottom_constraints_ = nullptr; +}; + +} // namespace tesseract. + +#endif // TESSERACT_TEXTORD_TABVECTOR_H_
