Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/colpartitionset.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/colpartitionset.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,686 @@ +/////////////////////////////////////////////////////////////////////// +// File: colpartitionset.cpp +// Description: Class to hold a list of ColPartitions of the page that +// correspond roughly to columns. +// Author: Ray Smith +// +// (C) Copyright 2008, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "colpartitionset.h" +#include "tablefind.h" +#include "workingpartset.h" + +namespace tesseract { + +// Minimum width of a column to be interesting as a multiple of resolution. +const double kMinColumnWidth = 2.0 / 3; + +ColPartitionSet::ColPartitionSet(ColPartition_LIST *partitions) { + ColPartition_IT it(&parts_); + it.add_list_after(partitions); + ComputeCoverage(); +} + +ColPartitionSet::ColPartitionSet(ColPartition *part) { + ColPartition_IT it(&parts_); + it.add_after_then_move(part); + ComputeCoverage(); +} + +// Returns the number of columns of good width. +int ColPartitionSet::GoodColumnCount() const { + int num_good_cols = 0; + // This is a read-only iteration of the list. + ColPartition_IT it(const_cast<ColPartition_LIST *>(&parts_)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + if (it.data()->good_width()) { + ++num_good_cols; + } + } + return num_good_cols; +} + +// Return an element of the parts_ list from its index. +ColPartition *ColPartitionSet::GetColumnByIndex(int index) { + ColPartition_IT it(&parts_); + it.mark_cycle_pt(); + for (int i = 0; i < index && !it.cycled_list(); ++i, it.forward()) { + ; + } + if (it.cycled_list()) { + return nullptr; + } + return it.data(); +} + +// Return the ColPartition that contains the given coords, if any, else nullptr. +ColPartition *ColPartitionSet::ColumnContaining(int x, int y) { + ColPartition_IT it(&parts_); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColPartition *part = it.data(); + if (part->ColumnContains(x, y)) { + return part; + } + } + return nullptr; +} + +// Extract all the parts from the list, relinquishing ownership. +void ColPartitionSet::RelinquishParts() { + ColPartition_IT it(&parts_); + while (!it.empty()) { + it.extract(); + it.forward(); + } +} + +// Attempt to improve this by adding partitions or expanding partitions. +void ColPartitionSet::ImproveColumnCandidate(const WidthCallback &cb, + PartSetVector *src_sets) { + int set_size = src_sets->size(); + // Iterate over the provided column sets, as each one may have something + // to improve this. + for (int i = 0; i < set_size; ++i) { + ColPartitionSet *column_set = src_sets->at(i); + if (column_set == nullptr) { + continue; + } + // Iterate over the parts in this and column_set, adding bigger or + // new parts in column_set to this. + ColPartition_IT part_it(&parts_); + ASSERT_HOST(!part_it.empty()); + int prev_right = INT32_MIN; + part_it.mark_cycle_pt(); + ColPartition_IT col_it(&column_set->parts_); + for (col_it.mark_cycle_pt(); !col_it.cycled_list(); col_it.forward()) { + ColPartition *col_part = col_it.data(); + if (col_part->blob_type() < BRT_UNKNOWN) { + continue; // Ignore image partitions. + } + int col_left = col_part->left_key(); + int col_right = col_part->right_key(); + // Sync-up part_it (in this) so it matches the col_part in column_set. + ColPartition *part = part_it.data(); + while (!part_it.at_last() && part->right_key() < col_left) { + prev_right = part->right_key(); + part_it.forward(); + part = part_it.data(); + } + int part_left = part->left_key(); + int part_right = part->right_key(); + if (part_right < col_left || col_right < part_left) { + // There is no overlap so this is a new partition. + AddPartition(col_part->ShallowCopy(), &part_it); + continue; + } + // Check the edges of col_part to see if they can improve part. + bool part_width_ok = cb(part->KeyWidth(part_left, part_right)); + if (col_left < part_left && col_left > prev_right) { + // The left edge of the column is better and it doesn't overlap, + // so we can potentially expand it. + int col_box_left = col_part->BoxLeftKey(); + bool tab_width_ok = cb(part->KeyWidth(col_left, part_right)); + bool box_width_ok = cb(part->KeyWidth(col_box_left, part_right)); + if (tab_width_ok || (!part_width_ok)) { + // The tab is leaving the good column metric at least as good as + // it was before, so use the tab. + part->CopyLeftTab(*col_part, false); + part->SetColumnGoodness(cb); + } else if (col_box_left < part_left && + (box_width_ok || !part_width_ok)) { + // The box is leaving the good column metric at least as good as + // it was before, so use the box. + part->CopyLeftTab(*col_part, true); + part->SetColumnGoodness(cb); + } + part_left = part->left_key(); + } + if (col_right > part_right && + (part_it.at_last() || + part_it.data_relative(1)->left_key() > col_right)) { + // The right edge is better, so we can possibly expand it. + int col_box_right = col_part->BoxRightKey(); + bool tab_width_ok = cb(part->KeyWidth(part_left, col_right)); + bool box_width_ok = cb(part->KeyWidth(part_left, col_box_right)); + if (tab_width_ok || (!part_width_ok)) { + // The tab is leaving the good column metric at least as good as + // it was before, so use the tab. + part->CopyRightTab(*col_part, false); + part->SetColumnGoodness(cb); + } else if (col_box_right > part_right && + (box_width_ok || !part_width_ok)) { + // The box is leaving the good column metric at least as good as + // it was before, so use the box. + part->CopyRightTab(*col_part, true); + part->SetColumnGoodness(cb); + } + } + } + } + ComputeCoverage(); +} + +// If this set is good enough to represent a new partitioning into columns, +// add it to the vector of sets, otherwise delete it. +void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector *column_sets, + const WidthCallback &cb) { + bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(), + bounding_box_.bottom()); + if (debug) { + tprintf("Considering new column candidate:\n"); + Print(); + } + if (!LegalColumnCandidate()) { + if (debug) { + tprintf("Not a legal column candidate:\n"); + Print(); + } + delete this; + return; + } + for (unsigned i = 0; i < column_sets->size(); ++i) { + ColPartitionSet *columns = column_sets->at(i); + // In ordering the column set candidates, good_coverage_ is king, + // followed by good_column_count_ and then bad_coverage_. + bool better = good_coverage_ > columns->good_coverage_; + if (good_coverage_ == columns->good_coverage_) { + better = good_column_count_ > columns->good_column_count_; + if (good_column_count_ == columns->good_column_count_) { + better = bad_coverage_ > columns->bad_coverage_; + } + } + if (better) { + // The new one is better so add it. + if (debug) { + tprintf("Good one\n"); + } + column_sets->insert(column_sets->begin() + i, this); + return; + } + if (columns->CompatibleColumns(false, this, cb)) { + if (debug) { + tprintf("Duplicate\n"); + } + delete this; + return; // It is not unique. + } + } + if (debug) { + tprintf("Added to end\n"); + } + column_sets->push_back(this); +} + +// Return true if the partitions in other are all compatible with the columns +// in this. +bool ColPartitionSet::CompatibleColumns(bool debug, ColPartitionSet *other, + const WidthCallback &cb) { + if (debug) { + tprintf("CompatibleColumns testing compatibility\n"); + Print(); + other->Print(); + } + if (other->parts_.empty()) { + if (debug) { + tprintf("CompatibleColumns true due to empty other\n"); + } + return true; + } + ColPartition_IT it(&other->parts_); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColPartition *part = it.data(); + if (part->blob_type() < BRT_UNKNOWN) { + if (debug) { + tprintf("CompatibleColumns ignoring image partition\n"); + part->Print(); + } + continue; // Image partitions are irrelevant to column compatibility. + } + int y = part->MidY(); + int left = part->bounding_box().left(); + int right = part->bounding_box().right(); + ColPartition *left_col = ColumnContaining(left, y); + ColPartition *right_col = ColumnContaining(right, y); + if (right_col == nullptr || left_col == nullptr) { + if (debug) { + tprintf("CompatibleColumns false due to partition edge outside\n"); + part->Print(); + } + return false; // A partition edge lies outside of all columns + } + if (right_col != left_col && cb(right - left)) { + if (debug) { + tprintf("CompatibleColumns false due to good width in multiple cols\n"); + part->Print(); + } + return false; // Partition with a good width must be in a single column. + } + + ColPartition_IT it2 = it; + while (!it2.at_last()) { + it2.forward(); + ColPartition *next_part = it2.data(); + if (!BLOBNBOX::IsTextType(next_part->blob_type())) { + continue; // Non-text partitions are irrelevant. + } + int next_left = next_part->bounding_box().left(); + if (next_left == right) { + break; // They share the same edge, so one must be a pull-out. + } + // Search to see if right and next_left fall within a single column. + ColPartition *next_left_col = ColumnContaining(next_left, y); + if (right_col == next_left_col) { + // There is a column break in this column. + // This can be due to a figure caption within a column, a pull-out + // block, or a simple broken textline that remains to be merged: + // all allowed, or a change in column layout: not allowed. + // If both partitions are of good width, then it is likely + // a change in column layout, otherwise probably an allowed situation. + if (part->good_width() && next_part->good_width()) { + if (debug) { + int next_right = next_part->bounding_box().right(); + tprintf("CompatibleColumns false due to 2 parts of good width\n"); + tprintf("part1 %d-%d, part2 %d-%d\n", left, right, next_left, + next_right); + right_col->Print(); + } + return false; + } + } + break; + } + } + if (debug) { + tprintf("CompatibleColumns true!\n"); + } + return true; +} + +// Returns the total width of all blobs in the part_set that do not lie +// within an approved column. Used as a cost measure for using this +// column set over another that might be compatible. +int ColPartitionSet::UnmatchedWidth(ColPartitionSet *part_set) { + int total_width = 0; + ColPartition_IT it(&part_set->parts_); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColPartition *part = it.data(); + if (!BLOBNBOX::IsTextType(part->blob_type())) { + continue; // Non-text partitions are irrelevant to column compatibility. + } + int y = part->MidY(); + BLOBNBOX_C_IT box_it(part->boxes()); + for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) { + const TBOX &box = it.data()->bounding_box(); + // Assume that the whole blob is outside any column iff its x-middle + // is outside. + int x = (box.left() + box.right()) / 2; + ColPartition *col = ColumnContaining(x, y); + if (col == nullptr) { + total_width += box.width(); + } + } + } + return total_width; +} + +// Return true if this ColPartitionSet makes a legal column candidate by +// having legal individual partitions and non-overlapping adjacent pairs. +bool ColPartitionSet::LegalColumnCandidate() { + ColPartition_IT it(&parts_); + if (it.empty()) { + return false; + } + bool any_text_parts = false; + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColPartition *part = it.data(); + if (BLOBNBOX::IsTextType(part->blob_type())) { + if (!part->IsLegal()) { + return false; // Individual partition is illegal. + } + any_text_parts = true; + } + if (!it.at_last()) { + ColPartition *next_part = it.data_relative(1); + if (next_part->left_key() < part->right_key()) { + return false; + } + } + } + return any_text_parts; +} + +// Return a copy of this. If good_only will only copy the Good ColPartitions. +ColPartitionSet *ColPartitionSet::Copy(bool good_only) { + ColPartition_LIST copy_parts; + ColPartition_IT src_it(&parts_); + ColPartition_IT dest_it(©_parts); + for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { + ColPartition *part = src_it.data(); + if (BLOBNBOX::IsTextType(part->blob_type()) && + (!good_only || part->good_width() || part->good_column())) { + dest_it.add_after_then_move(part->ShallowCopy()); + } + } + if (dest_it.empty()) { + return nullptr; + } + return new ColPartitionSet(©_parts); +} + +// Return the bounding boxes of columns at the given y-range +void ColPartitionSet::GetColumnBoxes(int y_bottom, int y_top, + ColSegment_LIST *segments) { + ColPartition_IT it(&parts_); + ColSegment_IT col_it(segments); + col_it.move_to_last(); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColPartition *part = it.data(); + ICOORD bot_left(part->LeftAtY(y_top), y_bottom); + ICOORD top_right(part->RightAtY(y_bottom), y_top); + auto *col_seg = new ColSegment(); + col_seg->InsertBox(TBOX(bot_left, top_right)); + col_it.add_after_then_move(col_seg); + } +} + +#ifndef GRAPHICS_DISABLED + +// Display the edges of the columns at the given y coords. +void ColPartitionSet::DisplayColumnEdges(int y_bottom, int y_top, + ScrollView *win) { + ColPartition_IT it(&parts_); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColPartition *part = it.data(); + win->Line(part->LeftAtY(y_top), y_top, part->LeftAtY(y_bottom), y_bottom); + win->Line(part->RightAtY(y_top), y_top, part->RightAtY(y_bottom), y_bottom); + } +} + +#endif // !GRAPHICS_DISABLED + +// Return the ColumnSpanningType that best explains the columns overlapped +// by the given coords(left,right,y), with the given margins. +// Also return the first and last column index touched by the coords and +// the leftmost spanned column. +// Column indices are 2n + 1 for real columns (0 based) and even values +// represent the gaps in between columns, with 0 being left of the leftmost. +// resolution refers to the ppi resolution of the image. +ColumnSpanningType ColPartitionSet::SpanningType( + int resolution, int left, int right, int height, int y, int left_margin, + int right_margin, int *first_col, int *last_col, int *first_spanned_col) { + *first_col = -1; + *last_col = -1; + *first_spanned_col = -1; + int margin_columns = 0; + ColPartition_IT it(&parts_); + int col_index = 1; + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), col_index += 2) { + ColPartition *part = it.data(); + if (part->ColumnContains(left, y) || + (it.at_first() && part->ColumnContains(left + height, y))) { + // In the default case, first_col is set, but columns_spanned remains + // zero, so first_col will get reset in the first column genuinely + // spanned, but we can tell the difference from a noise partition + // that touches no column. + *first_col = col_index; + if (part->ColumnContains(right, y) || + (it.at_last() && part->ColumnContains(right - height, y))) { + // Both within a single column. + *last_col = col_index; + return CST_FLOWING; + } + if (left_margin <= part->LeftAtY(y)) { + // It completely spans this column. + *first_spanned_col = col_index; + margin_columns = 1; + } + } else if (part->ColumnContains(right, y) || + (it.at_last() && part->ColumnContains(right - height, y))) { + if (*first_col < 0) { + // It started in-between. + *first_col = col_index - 1; + } + if (right_margin >= part->RightAtY(y)) { + // It completely spans this column. + if (margin_columns == 0) { + *first_spanned_col = col_index; + } + ++margin_columns; + } + *last_col = col_index; + break; + } else if (left < part->LeftAtY(y) && right > part->RightAtY(y)) { + // Neither left nor right are contained within, so it spans this + // column. + if (*first_col < 0) { + // It started in between the previous column and the current column. + *first_col = col_index - 1; + } + if (margin_columns == 0) { + *first_spanned_col = col_index; + } + *last_col = col_index; + } else if (right < part->LeftAtY(y)) { + // We have gone past the end. + *last_col = col_index - 1; + if (*first_col < 0) { + // It must lie completely between columns =>noise. + *first_col = col_index - 1; + } + break; + } + } + if (*first_col < 0) { + *first_col = col_index - 1; // The last in-between. + } + if (*last_col < 0) { + *last_col = col_index - 1; // The last in-between. + } + ASSERT_HOST(*first_col >= 0 && *last_col >= 0); + ASSERT_HOST(*first_col <= *last_col); + if (*first_col == *last_col && right - left < kMinColumnWidth * resolution) { + // Neither end was in a column, and it didn't span any, so it lies + // entirely between columns, therefore noise. + return CST_NOISE; + } else if (margin_columns <= 1) { + // An exception for headings that stick outside of single-column text. + if (margin_columns == 1 && parts_.singleton()) { + return CST_HEADING; + } + // It is a pullout, as left and right were not in the same column, but + // it doesn't go to the edge of its start and end. + return CST_PULLOUT; + } + // Its margins went to the edges of first and last columns => heading. + return CST_HEADING; +} + +// The column_set has changed. Close down all in-progress WorkingPartSets in +// columns that do not match and start new ones for the new columns in this. +// As ColPartitions are turned into BLOCKs, the used ones are put in +// used_parts, as they still need to be referenced in the grid. +void ColPartitionSet::ChangeWorkColumns(const ICOORD &bleft, + const ICOORD &tright, int resolution, + ColPartition_LIST *used_parts, + WorkingPartSet_LIST *working_set_list) { + // Move the input list to a temporary location so we can delete its elements + // as we add them to the output working_set. + WorkingPartSet_LIST work_src; + WorkingPartSet_IT src_it(&work_src); + src_it.add_list_after(working_set_list); + src_it.move_to_first(); + WorkingPartSet_IT dest_it(working_set_list); + // Completed blocks and to_blocks are accumulated and given to the first new + // one whenever we keep a column, or at the end. + BLOCK_LIST completed_blocks; + TO_BLOCK_LIST to_blocks; + WorkingPartSet *first_new_set = nullptr; + WorkingPartSet *working_set = nullptr; + ColPartition_IT col_it(&parts_); + for (col_it.mark_cycle_pt(); !col_it.cycled_list(); col_it.forward()) { + ColPartition *column = col_it.data(); + // Any existing column to the left of column is completed. + while (!src_it.empty() && + ((working_set = src_it.data())->column() == nullptr || + working_set->column()->right_key() <= column->left_key())) { + src_it.extract(); + working_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts, + &completed_blocks, &to_blocks); + delete working_set; + src_it.forward(); + } + // Make a new between-column WorkingSet for before the current column. + working_set = new WorkingPartSet(nullptr); + dest_it.add_after_then_move(working_set); + if (first_new_set == nullptr) { + first_new_set = working_set; + } + // A matching column gets to stay, and first_new_set gets all the + // completed_sets. + working_set = src_it.empty() ? nullptr : src_it.data(); + if (working_set != nullptr && + working_set->column()->MatchingColumns(*column)) { + working_set->set_column(column); + dest_it.add_after_then_move(src_it.extract()); + src_it.forward(); + first_new_set->InsertCompletedBlocks(&completed_blocks, &to_blocks); + first_new_set = nullptr; + } else { + // Just make a new working set for the current column. + working_set = new WorkingPartSet(column); + dest_it.add_after_then_move(working_set); + } + } + // Complete any remaining src working sets. + while (!src_it.empty()) { + working_set = src_it.extract(); + working_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts, + &completed_blocks, &to_blocks); + delete working_set; + src_it.forward(); + } + // Make a new between-column WorkingSet for after the last column. + working_set = new WorkingPartSet(nullptr); + dest_it.add_after_then_move(working_set); + if (first_new_set == nullptr) { + first_new_set = working_set; + } + // The first_new_set now gets any accumulated completed_parts/blocks. + first_new_set->InsertCompletedBlocks(&completed_blocks, &to_blocks); +} + +// Accumulate the widths and gaps into the given variables. +void ColPartitionSet::AccumulateColumnWidthsAndGaps(int *total_width, + int *width_samples, + int *total_gap, + int *gap_samples) { + ColPartition_IT it(&parts_); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColPartition *part = it.data(); + *total_width += part->ColumnWidth(); + ++*width_samples; + if (!it.at_last()) { + ColPartition *next_part = it.data_relative(1); + int part_left = part->right_key(); + int part_right = next_part->left_key(); + int gap = part->KeyWidth(part_left, part_right); + *total_gap += gap; + ++*gap_samples; + } + } +} + +// Provide debug output for this ColPartitionSet and all the ColPartitions. +void ColPartitionSet::Print() { + ColPartition_IT it(&parts_); + tprintf( + "Partition set of %d parts, %d good, coverage=%d+%d" + " (%d,%d)->(%d,%d)\n", + it.length(), good_column_count_, good_coverage_, bad_coverage_, + bounding_box_.left(), bounding_box_.bottom(), bounding_box_.right(), + bounding_box_.top()); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColPartition *part = it.data(); + part->Print(); + } +} + +// PRIVATE CODE. + +// Add the given partition to the list in the appropriate place. +void ColPartitionSet::AddPartition(ColPartition *new_part, + ColPartition_IT *it) { + AddPartitionCoverageAndBox(*new_part); + int new_right = new_part->right_key(); + if (it->data()->left_key() >= new_right) { + it->add_before_stay_put(new_part); + } else { + it->add_after_stay_put(new_part); + } +} + +// Compute the coverage and good column count. Coverage is the amount of the +// width of the page (in pixels) that is covered by ColPartitions, which are +// used to provide candidate column layouts. +// Coverage is split into good and bad. Good coverage is provided by +// ColPartitions of a frequent width (according to the callback function +// provided by TabFinder::WidthCB, which accesses stored statistics on the +// widths of ColPartitions) and bad coverage is provided by all other +// ColPartitions, even if they have tab vectors at both sides. Thus: +// |-----------------------------------------------------------------| +// | Double width heading | +// |-----------------------------------------------------------------| +// |-------------------------------| |-------------------------------| +// | Common width ColPartition | | Common width ColPartition | +// |-------------------------------| |-------------------------------| +// the layout with two common-width columns has better coverage than the +// double width heading, because the coverage is "good," even though less in +// total coverage than the heading, because the heading coverage is "bad." +void ColPartitionSet::ComputeCoverage() { + // Count the number of good columns and sum their width. + ColPartition_IT it(&parts_); + good_column_count_ = 0; + good_coverage_ = 0; + bad_coverage_ = 0; + bounding_box_ = TBOX(); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColPartition *part = it.data(); + AddPartitionCoverageAndBox(*part); + } +} + +// Adds the coverage, column count and box for a single partition, +// without adding it to the list. (Helper factored from ComputeCoverage.) +void ColPartitionSet::AddPartitionCoverageAndBox(const ColPartition &part) { + bounding_box_ += part.bounding_box(); + int coverage = part.ColumnWidth(); + if (part.good_width()) { + good_coverage_ += coverage; + good_column_count_ += 2; + } else { + if (part.blob_type() < BRT_UNKNOWN) { + coverage /= 2; + } + if (part.good_column()) { + ++good_column_count_; + } + bad_coverage_ += coverage; + } +} + +} // namespace tesseract.
