diff mupdf-source/thirdparty/tesseract/src/textord/colpartitionset.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/textord/colpartitionset.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,172 @@
+///////////////////////////////////////////////////////////////////////
+// File:        colpartitionset.h
+// Description: Class to hold a list of ColPartitions of the page that
+//              correspond roughly to columns.
+// Author:      Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_TEXTORD_COLPARTITIONSET_H_
+#define TESSERACT_TEXTORD_COLPARTITIONSET_H_
+
+#include "colpartition.h" // For ColPartition_LIST.
+#include "rect.h"         // For TBOX.
+#include "tabvector.h"    // For BLOBNBOX_CLIST.
+
+namespace tesseract {
+
+class WorkingPartSet_LIST; // Forward decl.; used by ChangeWorkColumns.
+class ColSegment_LIST;     // Forward decl.; used by GetColumnBoxes.
+class ColPartitionSet;     // Forward decl.; needed by PartSetVector below.
+using PartSetVector = std::vector<ColPartitionSet *>; // Vector of set pointers.
+
+// ColPartitionSet is a class that holds a list of ColPartitions.
+// Its main use is in holding a candidate partitioning of the width of the
+// image into columns, where each member ColPartition is a single column.
+// ColPartitionSets are used in building the column layout of a page.
+class ColPartitionSet : public ELIST_LINK {
+public:
+  ColPartitionSet() = default; // Empty set; counters are zeroed below.
+  explicit ColPartitionSet(ColPartition_LIST *partitions);
+  explicit ColPartitionSet(ColPartition *partition);
+
+  ~ColPartitionSet() = default;
+
+  // Simple accessors.
+  const TBOX &bounding_box() const {
+    return bounding_box_;
+  }
+  bool Empty() const {
+    return parts_.empty();
+  }
+  int ColumnCount() const {
+    return parts_.length();
+  }
+
+  // Returns the number of columns of good width.
+  int GoodColumnCount() const;
+
+  // Return an element of the parts_ list from its index.
+  ColPartition *GetColumnByIndex(int index);
+
+  // Return the ColPartition that contains the given coords, if any, else
+  // nullptr.
+  ColPartition *ColumnContaining(int x, int y);
+
+  // Return the bounding boxes of columns at the given y-range.
+  void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments);
+
+  // Extract all the parts from the list, relinquishing ownership.
+  void RelinquishParts();
+
+  // Attempt to improve this by adding partitions or expanding partitions.
+  void ImproveColumnCandidate(const WidthCallback &cb, PartSetVector *src_sets);
+
+  // If this set is good enough to represent a new partitioning into columns,
+  // add it to the vector of sets, otherwise delete it.
+  void AddToColumnSetsIfUnique(PartSetVector *column_sets,
+                               const WidthCallback &cb);
+
+  // Return true if the partitions in other are all compatible with the columns
+  // in this.
+  bool CompatibleColumns(bool debug, ColPartitionSet *other,
+                         const WidthCallback &cb);
+
+  // Returns the total width of all blobs in the part_set that do not lie
+  // within an approved column. Used as a cost measure for using this
+  // column set over another that might be compatible.
+  int UnmatchedWidth(ColPartitionSet *part_set);
+
+  // Return true if this ColPartitionSet makes a legal column candidate by
+  // having legal individual partitions and non-overlapping adjacent pairs.
+  bool LegalColumnCandidate();
+
+  // Return a copy of this. If good_only, copies only the good ColPartitions.
+  ColPartitionSet *Copy(bool good_only);
+
+  // Display the edges of the columns at the given y coords.
+  void DisplayColumnEdges(int y_bottom, int y_top, ScrollView *win);
+
+  // Return the ColumnSpanningType that best explains the columns overlapped
+  // by the given coords(left,right,y), with the given margins.
+  // Also return the first and last column index touched by the coords and
+  // the leftmost spanned column.
+  // Column indices are 2n + 1 for real columns (0 based) and even values
+  // represent the gaps in between columns, with 0 being left of the leftmost.
+  // resolution refers to the ppi resolution of the image. It may be 0 if only
+  // the first_col and last_col are required.
+  ColumnSpanningType SpanningType(int resolution, int left, int right,
+                                  int height, int y, int left_margin,
+                                  int right_margin, int *first_col,
+                                  int *last_col, int *first_spanned_col);
+
+  // The column_set has changed. Close down all in-progress WorkingPartSets in
+  // columns that do not match and start new ones for the new columns in this.
+  // As ColPartitions are turned into BLOCKs, the used ones are put in
+  // used_parts, as they still need to be referenced in the grid.
+  void ChangeWorkColumns(const ICOORD &bleft, const ICOORD &tright,
+                         int resolution, ColPartition_LIST *used_parts,
+                         WorkingPartSet_LIST *working_set);
+
+  // Accumulate the widths and gaps into the given variables.
+  void AccumulateColumnWidthsAndGaps(int *total_width, int *width_samples,
+                                     int *total_gap, int *gap_samples);
+
+  // Provide debug output for this ColPartitionSet and all the ColPartitions.
+  void Print();
+
+private:
+  // Add the given partition to the list in the appropriate place.
+  void AddPartition(ColPartition *new_part, ColPartition_IT *it);
+
+  // Compute the coverage and good column count. Coverage is the amount of the
+  // width of the page (in pixels) that is covered by ColPartitions, which are
+  // used to provide candidate column layouts.
+  // Coverage is split into good and bad. Good coverage is provided by
+  // ColPartitions of a frequent width (according to the callback function
+  // provided by TabFinder::WidthCB, which accesses stored statistics on the
+  // widths of ColPartitions) and bad coverage is provided by all other
+  // ColPartitions, even if they have tab vectors at both sides. Thus:
+  // |-----------------------------------------------------------------|
+  // |        Double     width    heading                              |
+  // |-----------------------------------------------------------------|
+  // |-------------------------------| |-------------------------------|
+  // |   Common width ColPartition   | |  Common width ColPartition    |
+  // |-------------------------------| |-------------------------------|
+  // the layout with two common-width columns has better coverage than the
+  // double width heading, because the coverage is "good," even though less in
+  // total coverage than the heading, because the heading coverage is "bad."
+  void ComputeCoverage();
+
+  // Adds the coverage, column count and box for a single partition,
+  // without adding it to the list. (Helper factored from ComputeCoverage.)
+  void AddPartitionCoverageAndBox(const ColPartition &part);
+
+  // The partitions in this column candidate.
+  ColPartition_LIST parts_;
+  // The number of partitions that have a frequent column width.
+  int good_column_count_ = 0;
+  // Total width of all the good ColPartitions.
+  int good_coverage_ = 0;
+  // Total width of all the bad ColPartitions.
+  int bad_coverage_ = 0;
+  // Bounding box of all partitions in the set.
+  TBOX bounding_box_;
+};
+
+ELISTIZEH(ColPartitionSet) // Presumably declares the _LIST/_IT types; confirm.
+
+} // namespace tesseract.
+
+#endif // TESSERACT_TEXTORD_COLPARTITIONSET_H_