Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/colpartitionset.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/colpartitionset.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,172 @@ +/////////////////////////////////////////////////////////////////////// +// File: colpartitionset.h +// Description: Class to hold a list of ColPartitions of the page that +// correspond roughly to columns. +// Author: Ray Smith +// +// (C) Copyright 2008, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_TEXTORD_COLPARTITIONSET_H_ +#define TESSERACT_TEXTORD_COLPARTITIONSET_H_ + +#include "colpartition.h" // For ColPartition_LIST. +#include "rect.h" // For TBOX. +#include "tabvector.h" // For BLOBNBOX_CLIST. + +namespace tesseract { + +class WorkingPartSet_LIST; +class ColSegment_LIST; +class ColPartitionSet; +using PartSetVector = std::vector<ColPartitionSet *>; + +// ColPartitionSet is a class that holds a list of ColPartitions. +// Its main use is in holding a candidate partitioning of the width of the +// image into columns, where each member ColPartition is a single column. +// ColPartitionSets are used in building the column layout of a page. +class ColPartitionSet : public ELIST_LINK { +public: + ColPartitionSet() = default; + explicit ColPartitionSet(ColPartition_LIST *partitions); + explicit ColPartitionSet(ColPartition *partition); + + ~ColPartitionSet() = default; + + // Simple accessors. + const TBOX &bounding_box() const { + return bounding_box_; + } + bool Empty() const { + return parts_.empty(); + } + int ColumnCount() const { + return parts_.length(); + } + + // Returns the number of columns of good width. + int GoodColumnCount() const; + + // Return an element of the parts_ list from its index. + ColPartition *GetColumnByIndex(int index); + + // Return the ColPartition that contains the given coords, if any, else + // nullptr. + ColPartition *ColumnContaining(int x, int y); + + // Return the bounding boxes of columns at the given y-range + void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments); + + // Extract all the parts from the list, relinquishing ownership. + void RelinquishParts(); + + // Attempt to improve this by adding partitions or expanding partitions. + void ImproveColumnCandidate(const WidthCallback &cb, PartSetVector *src_sets); + + // If this set is good enough to represent a new partitioning into columns, + // add it to the vector of sets, otherwise delete it. + void AddToColumnSetsIfUnique(PartSetVector *column_sets, + const WidthCallback &cb); + + // Return true if the partitions in other are all compatible with the columns + // in this. + bool CompatibleColumns(bool debug, ColPartitionSet *other, + const WidthCallback &cb); + + // Returns the total width of all blobs in the part_set that do not lie + // within an approved column. Used as a cost measure for using this + // column set over another that might be compatible. + int UnmatchedWidth(ColPartitionSet *part_set); + + // Return true if this ColPartitionSet makes a legal column candidate by + // having legal individual partitions and non-overlapping adjacent pairs. + bool LegalColumnCandidate(); + + // Return a copy of this. If good_only will only copy the Good ColPartitions. + ColPartitionSet *Copy(bool good_only); + + // Display the edges of the columns at the given y coords. + void DisplayColumnEdges(int y_bottom, int y_top, ScrollView *win); + + // Return the ColumnSpanningType that best explains the columns overlapped + // by the given coords(left,right,y), with the given margins. + // Also return the first and last column index touched by the coords and + // the leftmost spanned column. + // Column indices are 2n + 1 for real columns (0 based) and even values + // represent the gaps in between columns, with 0 being left of the leftmost. + // resolution refers to the ppi resolution of the image. It may be 0 if only + // the first_col and last_col are required. + ColumnSpanningType SpanningType(int resolution, int left, int right, + int height, int y, int left_margin, + int right_margin, int *first_col, + int *last_col, int *first_spanned_col); + + // The column_set has changed. Close down all in-progress WorkingPartSets in + // columns that do not match and start new ones for the new columns in this. + // As ColPartitions are turned into BLOCKs, the used ones are put in + // used_parts, as they still need to be referenced in the grid. + void ChangeWorkColumns(const ICOORD &bleft, const ICOORD &tright, + int resolution, ColPartition_LIST *used_parts, + WorkingPartSet_LIST *working_set); + + // Accumulate the widths and gaps into the given variables. + void AccumulateColumnWidthsAndGaps(int *total_width, int *width_samples, + int *total_gap, int *gap_samples); + + // Provide debug output for this ColPartitionSet and all the ColPartitions. + void Print(); + +private: + // Add the given partition to the list in the appropriate place. + void AddPartition(ColPartition *new_part, ColPartition_IT *it); + + // Compute the coverage and good column count. Coverage is the amount of the + // width of the page (in pixels) that is covered by ColPartitions, which are + // used to provide candidate column layouts. + // Coverage is split into good and bad. Good coverage is provided by + // ColPartitions of a frequent width (according to the callback function + // provided by TabFinder::WidthCB, which accesses stored statistics on the + // widths of ColPartitions) and bad coverage is provided by all other + // ColPartitions, even if they have tab vectors at both sides. Thus: + // |-----------------------------------------------------------------| + // | Double width heading | + // |-----------------------------------------------------------------| + // |-------------------------------| |-------------------------------| + // | Common width ColPartition | | Common width ColPartition | + // |-------------------------------| |-------------------------------| + // the layout with two common-width columns has better coverage than the + // double width heading, because the coverage is "good," even though less in + // total coverage than the heading, because the heading coverage is "bad." + void ComputeCoverage(); + + // Adds the coverage, column count and box for a single partition, + // without adding it to the list. (Helper factored from ComputeCoverage.) + void AddPartitionCoverageAndBox(const ColPartition &part); + + // The partitions in this column candidate. + ColPartition_LIST parts_; + // The number of partitions that have a frequent column width. + int good_column_count_; + // Total width of all the good ColPartitions. + int good_coverage_; + // Total width of all the bad ColPartitions. + int bad_coverage_; + // Bounding box of all partitions in the set. + TBOX bounding_box_; +}; + +ELISTIZEH(ColPartitionSet) + +} // namespace tesseract. + +#endif // TESSERACT_TEXTORD_COLPARTITION_H_
