view mupdf-source/thirdparty/tesseract/src/textord/colpartitionset.h @ 21:2f43e400f144

Provide an "all" target to build both the sdist and the wheel
author Franz Glasner <fzglas.hg@dom66.de>
date Fri, 19 Sep 2025 10:28:53 +0200
parents b50eed0cc0ef
children
line wrap: on
line source

///////////////////////////////////////////////////////////////////////
// File:        colpartitionset.h
// Description: Class to hold a list of ColPartitions of the page that
//              correspond roughly to columns.
// Author:      Ray Smith
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_TEXTORD_COLPARTITIONSET_H_
#define TESSERACT_TEXTORD_COLPARTITIONSET_H_

#include "colpartition.h" // For ColPartition_LIST.
#include "rect.h"         // For TBOX.
#include "tabvector.h"    // For BLOBNBOX_CLIST.

namespace tesseract {

// Forward declarations: these list types are only used through pointers in
// the declarations below, so their full definitions are not needed here.
class WorkingPartSet_LIST;
class ColSegment_LIST;
class ColPartitionSet;
// A vector of raw (plain) pointers to ColPartitionSet. The vector type itself
// expresses no ownership of the pointed-to sets.
using PartSetVector = std::vector<ColPartitionSet *>;

// ColPartitionSet is a class that holds a list of ColPartitions.
// Its main use is in holding a candidate partitioning of the width of the
// image into columns, where each member ColPartition is a single column.
// ColPartitionSets are used in building the column layout of a page.
class ColPartitionSet : public ELIST_LINK {
public:
  // Constructs an empty set. The coverage and column-count members are
  // zero-initialized by their default member initializers, so a
  // default-constructed set never exposes indeterminate values.
  ColPartitionSet() = default;
  // Constructs a set from a list of partitions.
  explicit ColPartitionSet(ColPartition_LIST *partitions);
  // Constructs a set from a single partition.
  explicit ColPartitionSet(ColPartition *partition);

  ~ColPartitionSet() = default;

  // Simple accessors.
  // Returns the stored bounding box by const reference.
  const TBOX &bounding_box() const {
    return bounding_box_;
  }
  // Returns true if the parts_ list holds no partitions.
  bool Empty() const {
    return parts_.empty();
  }
  // Returns the number of partitions in the parts_ list.
  int ColumnCount() const {
    return parts_.length();
  }

  // Returns the number of columns of good width.
  int GoodColumnCount() const;

  // Return an element of the parts_ list from its index.
  ColPartition *GetColumnByIndex(int index);

  // Return the ColPartition that contains the given coords, if any, else
  // nullptr.
  ColPartition *ColumnContaining(int x, int y);

  // Return the bounding boxes of columns at the given y-range
  void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments);

  // Extract all the parts from the list, relinquishing ownership.
  void RelinquishParts();

  // Attempt to improve this by adding partitions or expanding partitions.
  void ImproveColumnCandidate(const WidthCallback &cb, PartSetVector *src_sets);

  // If this set is good enough to represent a new partitioning into columns,
  // add it to the vector of sets, otherwise delete it.
  void AddToColumnSetsIfUnique(PartSetVector *column_sets,
                               const WidthCallback &cb);

  // Return true if the partitions in other are all compatible with the columns
  // in this.
  bool CompatibleColumns(bool debug, ColPartitionSet *other,
                         const WidthCallback &cb);

  // Returns the total width of all blobs in the part_set that do not lie
  // within an approved column. Used as a cost measure for using this
  // column set over another that might be compatible.
  int UnmatchedWidth(ColPartitionSet *part_set);

  // Return true if this ColPartitionSet makes a legal column candidate by
  // having legal individual partitions and non-overlapping adjacent pairs.
  bool LegalColumnCandidate();

  // Return a copy of this. If good_only will only copy the Good ColPartitions.
  ColPartitionSet *Copy(bool good_only);

  // Display the edges of the columns at the given y coords.
  void DisplayColumnEdges(int y_bottom, int y_top, ScrollView *win);

  // Return the ColumnSpanningType that best explains the columns overlapped
  // by the given coords(left,right,y), with the given margins.
  // Also return the first and last column index touched by the coords and
  // the leftmost spanned column.
  // Column indices are 2n + 1 for real columns (0 based) and even values
  // represent the gaps in between columns, with 0 being left of the leftmost.
  // resolution refers to the ppi resolution of the image. It may be 0 if only
  // the first_col and last_col are required.
  ColumnSpanningType SpanningType(int resolution, int left, int right,
                                  int height, int y, int left_margin,
                                  int right_margin, int *first_col,
                                  int *last_col, int *first_spanned_col);

  // The column_set has changed. Close down all in-progress WorkingPartSets in
  // columns that do not match and start new ones for the new columns in this.
  // As ColPartitions are turned into BLOCKs, the used ones are put in
  // used_parts, as they still need to be referenced in the grid.
  void ChangeWorkColumns(const ICOORD &bleft, const ICOORD &tright,
                         int resolution, ColPartition_LIST *used_parts,
                         WorkingPartSet_LIST *working_set);

  // Accumulate the widths and gaps into the given variables.
  void AccumulateColumnWidthsAndGaps(int *total_width, int *width_samples,
                                     int *total_gap, int *gap_samples);

  // Provide debug output for this ColPartitionSet and all the ColPartitions.
  void Print();

private:
  // Add the given partition to the list in the appropriate place.
  void AddPartition(ColPartition *new_part, ColPartition_IT *it);

  // Compute the coverage and good column count. Coverage is the amount of the
  // width of the page (in pixels) that is covered by ColPartitions, which are
  // used to provide candidate column layouts.
  // Coverage is split into good and bad. Good coverage is provided by
  // ColPartitions of a frequent width (according to the callback function
  // provided by TabFinder::WidthCB, which accesses stored statistics on the
  // widths of ColPartitions) and bad coverage is provided by all other
  // ColPartitions, even if they have tab vectors at both sides. Thus:
  // |-----------------------------------------------------------------|
  // |        Double     width    heading                              |
  // |-----------------------------------------------------------------|
  // |-------------------------------| |-------------------------------|
  // |   Common width ColPartition   | |  Common width ColPartition    |
  // |-------------------------------| |-------------------------------|
  // the layout with two common-width columns has better coverage than the
  // double width heading, because the coverage is "good," even though less in
  // total coverage than the heading, because the heading coverage is "bad."
  void ComputeCoverage();

  // Adds the coverage, column count and box for a single partition,
  // without adding it to the list. (Helper factored from ComputeCoverage.)
  void AddPartitionCoverageAndBox(const ColPartition &part);

  // The partitions in this column candidate.
  ColPartition_LIST parts_;
  // The number of partitions that have a frequent column width.
  // Starts at zero so that an empty, default-constructed set reports no
  // good columns instead of an indeterminate value.
  int good_column_count_ = 0;
  // Total width of all the good ColPartitions (zero for an empty set).
  int good_coverage_ = 0;
  // Total width of all the bad ColPartitions (zero for an empty set).
  int bad_coverage_ = 0;
  // Bounding box of all partitions in the set.
  TBOX bounding_box_;
};

// NOTE(review): ELISTIZEH is a macro defined outside this header; presumably
// it declares the list/iterator companion types for ColPartitionSet (which
// derives from ELIST_LINK) — confirm against the macro's definition.
ELISTIZEH(ColPartitionSet)

} // namespace tesseract.

#endif // TESSERACT_TEXTORD_COLPARTITIONSET_H_