diff mupdf-source/thirdparty/tesseract/src/textord/colfind.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/textord/colfind.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,1641 @@
+///////////////////////////////////////////////////////////////////////
+// File:        colfind.cpp
+// Description: Class to hold BLOBNBOXs in a grid for fast access
+//              to neighbours.
+// Author:      Ray Smith
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "colfind.h"
+
+#include "ccnontextdetect.h"
+#include "colpartition.h"
+#include "colpartitionset.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "equationdetectbase.h"
+#endif
+#include "blobbox.h"
+#include "linefind.h"
+#include "normalis.h"
+#include "params.h"
+#include "scrollview.h"
+#include "strokewidth.h"
+#include "tablefind.h"
+#include "workingpartset.h"
+
+#include <algorithm>
+
+namespace tesseract {
+
+// When assigning columns, the max number of misfit grid rows/ColPartitionSets
+// that can be ignored.
+const int kMaxIncompatibleColumnCount = 2;
+// Max fraction of mean_column_gap_ for the gap between two partitions within a
+// column to allow them to merge.
+const double kHorizontalGapMergeFraction = 0.5;
+// Minimum gutter width as a fraction of gridsize
+const double kMinGutterWidthGrid = 0.5;
+// Max multiple of a partition's median size as a distance threshold for
+// adding noise blobs.
+const double kMaxDistToPartSizeRatio = 1.5;
+
+#ifndef GRAPHICS_DISABLED
+static BOOL_VAR(textord_tabfind_show_initial_partitions, false, "Show partition bounds");
+static BOOL_VAR(textord_tabfind_show_reject_blobs, false, "Show blobs rejected as noise");
+static INT_VAR(textord_tabfind_show_partitions, 0,
+               "Show partition bounds, waiting if >1 (ScrollView)");
+static BOOL_VAR(textord_tabfind_show_columns, false, "Show column bounds (ScrollView)");
+static BOOL_VAR(textord_tabfind_show_blocks, false, "Show final block bounds (ScrollView)");
+#endif
+static BOOL_VAR(textord_tabfind_find_tables, true, "run table detection");
+
+#ifndef GRAPHICS_DISABLED
+ScrollView *ColumnFinder::blocks_win_ = nullptr;
+#endif
+
+// Gridsize is an estimate of the text size in the image. A suitable value
+// is in TO_BLOCK::line_size after find_components has been used to make
+// the blobs.
+// bleft and tright are the bounds of the image (or rectangle) being processed.
+// vlines is a (possibly empty) list of TabVector and vertical_x and y are
+// the sum logical vertical vector produced by LineFinder::FindVerticalLines.
+ColumnFinder::ColumnFinder(int gridsize, const ICOORD &bleft, const ICOORD &tright, int resolution,
+                           bool cjk_script, double aligned_gap_fraction, TabVector_LIST *vlines,
+                           TabVector_LIST *hlines, int vertical_x, int vertical_y)
+    : TabFind(gridsize, bleft, tright, vlines, vertical_x, vertical_y, resolution)
+    , cjk_script_(cjk_script)
+    , min_gutter_width_(static_cast<int>(kMinGutterWidthGrid * gridsize))
+    , mean_column_gap_(tright.x() - bleft.x())
+    , tabfind_aligned_gap_fraction_(aligned_gap_fraction)
+    , deskew_(0.0f, 0.0f)
+    , reskew_(1.0f, 0.0f)
+    , rotation_(1.0f, 0.0f)
+    , rerotate_(1.0f, 0.0f)
+    , text_rotation_(0.0f, 0.0f)
+    , best_columns_(nullptr)
+    , stroke_width_(nullptr)
+    , part_grid_(gridsize, bleft, tright)
+    , nontext_map_(nullptr)
+    , projection_(resolution)
+    , denorm_(nullptr)
+    , equation_detect_(nullptr) {
+  TabVector_IT h_it(&horizontal_lines_);
+  h_it.add_list_after(hlines);
+}
+
+ColumnFinder::~ColumnFinder() {
+  for (auto set : column_sets_) {
+    delete set;
+  }
+  delete[] best_columns_;
+  delete stroke_width_;
+#ifndef GRAPHICS_DISABLED
+  delete input_blobs_win_;
+#endif
+  nontext_map_.destroy();
+  while (denorm_ != nullptr) {
+    DENORM *dead_denorm = denorm_;
+    denorm_ = const_cast<DENORM *>(denorm_->predecessor());
+    delete dead_denorm;
+  }
+
+  // The ColPartitions are destroyed automatically, but any boxes in
+  // the noise_parts_ list are owned and need to be deleted explicitly.
+  ColPartition_IT part_it(&noise_parts_);
+  for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
+    ColPartition *part = part_it.data();
+    part->DeleteBoxes();
+  }
+  // Likewise any boxes in the good_parts_ list need to be deleted.
+  // These are just the image parts. Text parts have already given their
+  // boxes on to the TO_BLOCK, and have empty lists.
+  part_it.set_to_list(&good_parts_);
+  for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
+    ColPartition *part = part_it.data();
+    part->DeleteBoxes();
+  }
+  // Also, any blobs on the image_bblobs_ list need to have their cblobs
+  // deleted. This only happens if there has been an early return from
+  // FindColumns, as in a normal return, the blobs go into the grid and
+  // end up in noise_parts_, good_parts_ or the output blocks.
+  BLOBNBOX_IT bb_it(&image_bblobs_);
+  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
+    BLOBNBOX *bblob = bb_it.data();
+    delete bblob->cblob();
+  }
+}
+
+// Performs initial processing on the blobs in the input_block:
+// Setup the part_grid_, stroke_width_, nontext_map.
+// Obvious noise blobs are filtered out and used to mark the nontext_map_.
+// Initial stroke-width analysis is used to get local text alignment
+// direction, so the textline projection_ map can be setup.
+// On return, IsVerticallyAlignedText may be called (now optionally) to
+// determine the gross textline alignment of the page.
+void ColumnFinder::SetupAndFilterNoise(PageSegMode pageseg_mode, Image photo_mask_pix,
+                                       TO_BLOCK *input_block) {
+  part_grid_.Init(gridsize(), bleft(), tright());
+  delete stroke_width_;
+  stroke_width_ = new StrokeWidth(gridsize(), bleft(), tright());
+  min_gutter_width_ = static_cast<int>(kMinGutterWidthGrid * gridsize());
+  input_block->ReSetAndReFilterBlobs();
+#ifndef GRAPHICS_DISABLED
+  if (textord_tabfind_show_blocks) {
+    input_blobs_win_ = MakeWindow(0, 0, "Filtered Input Blobs");
+    input_block->plot_graded_blobs(input_blobs_win_);
+  }
+#endif // !GRAPHICS_DISABLED
+  SetBlockRuleEdges(input_block);
+  nontext_map_.destroy();
+  // Run a preliminary strokewidth neighbour detection on the medium blobs.
+  stroke_width_->SetNeighboursOnMediumBlobs(input_block);
+  CCNonTextDetect nontext_detect(gridsize(), bleft(), tright());
+  // Remove obvious noise and make the initial non-text map.
+  nontext_map_ =
+      nontext_detect.ComputeNonTextMask(textord_debug_tabfind, photo_mask_pix, input_block);
+  stroke_width_->FindTextlineDirectionAndFixBrokenCJK(pageseg_mode, cjk_script_, input_block);
+  // Clear the strokewidth grid ready for rotation or leader finding.
+  stroke_width_->Clear();
+}
+
+// Tests for vertical alignment of text (returning true if so), and generates
+// a list of blobs of moderate aspect ratio, in the most frequent writing
+// direction (in osd_blobs) for orientation and script detection to test
+// the character orientation.
+// block is the single block for the whole page or rectangle to be OCRed.
+// Note that the vertical alignment may be due to text whose writing direction
+// is vertical, like say Japanese, or due to text whose writing direction is
+// horizontal but whose text appears vertically aligned because the image is
+// not the right way up.
+bool ColumnFinder::IsVerticallyAlignedText(double find_vertical_text_ratio, TO_BLOCK *block,
+                                           BLOBNBOX_CLIST *osd_blobs) {
+  return stroke_width_->TestVerticalTextDirection(find_vertical_text_ratio, block, osd_blobs);
+}
+
+// Rotates the blobs and the TabVectors so that the gross writing direction
+// (text lines) are horizontal and lines are read down the page.
+// Applied rotation stored in rotation_.
+// A second rotation is calculated for application during recognition to
+// make the rotated blobs upright for recognition.
+// Subsequent rotation stored in text_rotation_.
+//
+// Arguments:
+//   vertical_text_lines true if the text lines are vertical.
+//   recognition_rotation [0..3] is the number of anti-clockwise 90 degree
+//   rotations from osd required for the text to be upright and readable.
+void ColumnFinder::CorrectOrientation(TO_BLOCK *block, bool vertical_text_lines,
+                                      int recognition_rotation) {
+  const FCOORD anticlockwise90(0.0f, 1.0f);
+  const FCOORD clockwise90(0.0f, -1.0f);
+  const FCOORD rotation180(-1.0f, 0.0f);
+  const FCOORD norotation(1.0f, 0.0f);
+
+  text_rotation_ = norotation;
+  // Rotate the page to make the text upright, as implied by
+  // recognition_rotation.
+  rotation_ = norotation;
+  if (recognition_rotation == 1) {
+    rotation_ = anticlockwise90;
+  } else if (recognition_rotation == 2) {
+    rotation_ = rotation180;
+  } else if (recognition_rotation == 3) {
+    rotation_ = clockwise90;
+  }
+  // We infer text writing direction to be vertical if there are several
+  // vertical text lines detected, and horizontal if not. But if the page
+  // orientation was determined to be 90 or 270 degrees, the true writing
+  // direction is the opposite of what we inferred.
+  if (recognition_rotation & 1) {
+    vertical_text_lines = !vertical_text_lines;
+  }
+  // If we still believe the writing direction is vertical, we use the
+  // convention of rotating the page ccw 90 degrees to make the text lines
+  // horizontal, and mark the blobs for rotation cw 90 degrees for
+  // classification so that the text order is correct after recognition.
+  if (vertical_text_lines) {
+    rotation_.rotate(anticlockwise90);
+    text_rotation_.rotate(clockwise90);
+  }
+  // Set rerotate_ to the inverse of rotation_.
+  rerotate_ = FCOORD(rotation_.x(), -rotation_.y());
+  if (rotation_.x() != 1.0f || rotation_.y() != 0.0f) {
+    // Rotate all the blobs and tab vectors.
+    RotateBlobList(rotation_, &block->large_blobs);
+    RotateBlobList(rotation_, &block->blobs);
+    RotateBlobList(rotation_, &block->small_blobs);
+    RotateBlobList(rotation_, &block->noise_blobs);
+    TabFind::ResetForVerticalText(rotation_, rerotate_, &horizontal_lines_, &min_gutter_width_);
+    part_grid_.Init(gridsize(), bleft(), tright());
+    // Reset all blobs to initial state and filter by size.
+    // Since they have rotated, the list they belong on could have changed.
+    block->ReSetAndReFilterBlobs();
+    SetBlockRuleEdges(block);
+    stroke_width_->CorrectForRotation(rerotate_, &part_grid_);
+  }
+  if (textord_debug_tabfind) {
+    tprintf("Vertical=%d, orientation=%d, final rotation=(%f, %f)+(%f,%f)\n", vertical_text_lines,
+            recognition_rotation, rotation_.x(), rotation_.y(), text_rotation_.x(),
+            text_rotation_.y());
+  }
+  // Setup the denormalization.
+  ASSERT_HOST(denorm_ == nullptr);
+  denorm_ = new DENORM;
+  denorm_->SetupNormalization(nullptr, &rotation_, nullptr, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f);
+}
+
+// Finds blocks of text, image, rule line, table etc, returning them in the
+// blocks and to_blocks
+// (Each TO_BLOCK points to the basic BLOCK and adds more information.)
+// Image blocks are generated by a combination of photo_mask_pix (which may
+// NOT be nullptr) and the rejected text found during preliminary textline
+// finding.
+// The input_block is the result of a call to find_components, and contains
+// the blobs found in the image or rectangle to be OCRed. These blobs will be
+// removed and placed in the output blocks, while unused ones will be deleted.
+// If single_column is true, the input is treated as single column, but
+// it is still divided into blocks of equal line spacing/text size.
+// scaled_color is scaled down by scaled_factor from the input color image,
+// and may be nullptr if the input was not color.
+// grey_pix is optional, but if present must match the photo_mask_pix in size,
+// and must be a *real* grey image instead of binary_pix * 255.
+// thresholds_pix is expected to be present iff grey_pix is present and
+// can be an integer factor reduction of the grey_pix. It represents the
+// thresholds that were used to create the binary_pix from the grey_pix.
+// If diacritic_blobs is non-null, then diacritics/noise blobs, that would
+// confuse layout analysis by causing textline overlap, are placed there,
+// with the expectation that they will be reassigned to words later and
+// noise/diacriticness determined via classification.
+// Returns -1 if the user hits the 'd' key in the blocks window while running
+// in debug mode, which requests a retry with more debug info.
+int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Image scaled_color, int scaled_factor,
+                             TO_BLOCK *input_block, Image photo_mask_pix, Image thresholds_pix,
+                             Image grey_pix, DebugPixa *pixa_debug, BLOCK_LIST *blocks,
+                             BLOBNBOX_LIST *diacritic_blobs, TO_BLOCK_LIST *to_blocks) {
+  photo_mask_pix |= nontext_map_;
+  stroke_width_->FindLeaderPartitions(input_block, &part_grid_);
+  stroke_width_->RemoveLineResidue(&big_parts_);
+  FindInitialTabVectors(nullptr, min_gutter_width_, tabfind_aligned_gap_fraction_, input_block);
+  SetBlockRuleEdges(input_block);
+  stroke_width_->GradeBlobsIntoPartitions(pageseg_mode, rerotate_, input_block, nontext_map_,
+                                          denorm_, cjk_script_, &projection_, diacritic_blobs,
+                                          &part_grid_, &big_parts_);
+  if (!PSM_SPARSE(pageseg_mode)) {
+    ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, input_block, this,
+                                   pixa_debug, &part_grid_, &big_parts_);
+    ImageFind::TransferImagePartsToImageMask(rerotate_, &part_grid_, photo_mask_pix);
+    ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, input_block, this,
+                                   pixa_debug, &part_grid_, &big_parts_);
+  }
+  part_grid_.ReTypeBlobs(&image_bblobs_);
+  TidyBlobs(input_block);
+  Reset();
+  // TODO(rays) need to properly handle big_parts_.
+  ColPartition_IT p_it(&big_parts_);
+  for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
+    p_it.data()->DisownBoxesNoAssert();
+  }
+  big_parts_.clear();
+  delete stroke_width_;
+  stroke_width_ = nullptr;
+  // Compute the edge offsets whether or not there is a grey_pix. It is done
+  // here as the c_blobs haven't been touched by rotation or anything yet,
+  // so no denorm is required, yet the text has been separated from image, so
+  // no time is wasted running it on image blobs.
+  input_block->ComputeEdgeOffsets(thresholds_pix, grey_pix);
+
+  // A note about handling right-to-left scripts (Hebrew/Arabic):
+  // The columns must be reversed and come out in right-to-left instead of
+  // the normal left-to-right order. Because the left-to-right ordering
+  // is implicit in many data structures, it is simpler to fool the algorithms
+  // into thinking they are dealing with left-to-right text.
+  // To do this, we reflect the needed data in the y-axis and then reflect
+  // the blocks back after they have been created. This is a temporary
+  // arrangement that is confined to this function only, so the reflection
+  // is completely invisible in the output blocks.
+  // The only objects reflected are:
+  // The vertical separator lines that have already been found;
+  // The bounding boxes of all BLOBNBOXES on all lists on the input_block
+  // plus the image_bblobs. The outlines are not touched, since they are
+  // not looked at.
+  bool input_is_rtl = input_block->block->right_to_left();
+  if (input_is_rtl) {
+    // Reflect the vertical separator lines (member of TabFind).
+    ReflectInYAxis();
+    // Reflect the blob boxes.
+    ReflectForRtl(input_block, &image_bblobs_);
+    part_grid_.ReflectInYAxis();
+  }
+
+  if (!PSM_SPARSE(pageseg_mode)) {
+    if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
+      // No tab stops needed. Just the grid that FindTabVectors makes.
+      DontFindTabVectors(&image_bblobs_, input_block, &deskew_, &reskew_);
+    } else {
+      SetBlockRuleEdges(input_block);
+      // Find the tab stops, estimate skew, and deskew the tabs, blobs and
+      // part_grid_.
+      FindTabVectors(&horizontal_lines_, &image_bblobs_, input_block, min_gutter_width_,
+                     tabfind_aligned_gap_fraction_, &part_grid_, &deskew_, &reskew_);
+      // Add the deskew to the denorm_.
+      auto *new_denorm = new DENORM;
+      new_denorm->SetupNormalization(nullptr, &deskew_, denorm_, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f,
+                                     0.0f);
+      denorm_ = new_denorm;
+    }
+    SetBlockRuleEdges(input_block);
+    part_grid_.SetTabStops(this);
+
+    // Make the column_sets_.
+    if (!MakeColumns(false)) {
+      tprintf("Empty page!!\n");
+      part_grid_.DeleteParts();
+      return 0; // This is an empty page.
+    }
+
+    // Refill the grid using rectangular spreading, and get the benefit
+    // of the completed tab vectors marking the rule edges of each blob.
+    Clear();
+#ifndef GRAPHICS_DISABLED
+    if (textord_tabfind_show_reject_blobs) {
+      ScrollView *rej_win = MakeWindow(500, 300, "Rejected blobs");
+      input_block->plot_graded_blobs(rej_win);
+    }
+#endif // !GRAPHICS_DISABLED
+    InsertBlobsToGrid(false, false, &image_bblobs_, this);
+    InsertBlobsToGrid(true, true, &input_block->blobs, this);
+
+    part_grid_.GridFindMargins(best_columns_);
+    // Split and merge the partitions by looking at local neighbours.
+    GridSplitPartitions();
+    // Resolve unknown partitions by adding to an existing partition, fixing
+    // the type, or declaring them noise.
+    part_grid_.GridFindMargins(best_columns_);
+    GridMergePartitions();
+    // Insert any unused noise blobs that are close enough to an appropriate
+    // partition.
+    InsertRemainingNoise(input_block);
+    // Add horizontal line separators as partitions.
+    GridInsertHLinePartitions();
+    GridInsertVLinePartitions();
+    // Recompute margins based on a local neighbourhood search.
+    part_grid_.GridFindMargins(best_columns_);
+    SetPartitionTypes();
+  }
+#ifndef GRAPHICS_DISABLED
+  if (textord_tabfind_show_initial_partitions) {
+    ScrollView *part_win = MakeWindow(100, 300, "InitialPartitions");
+    part_grid_.DisplayBoxes(part_win);
+    DisplayTabVectors(part_win);
+  }
+#endif
+  if (!PSM_SPARSE(pageseg_mode)) {
+#ifndef DISABLED_LEGACY_ENGINE
+    if (equation_detect_) {
+      equation_detect_->FindEquationParts(&part_grid_, best_columns_);
+    }
+#endif
+    if (textord_tabfind_find_tables) {
+      TableFinder table_finder;
+      table_finder.Init(gridsize(), bleft(), tright());
+      table_finder.set_resolution(resolution_);
+      table_finder.set_left_to_right_language(!input_block->block->right_to_left());
+      // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
+      // insert dot-like noise into period_grid_
+      table_finder.InsertCleanPartitions(&part_grid_, input_block);
+      // Get Table Regions
+      table_finder.LocateTables(&part_grid_, best_columns_, WidthCB(), reskew_);
+    }
+    GridRemoveUnderlinePartitions();
+    part_grid_.DeleteUnknownParts(input_block);
+
+    // Build the partitions into chains that belong in the same block and
+    // refine into one-to-one links, then smooth the types within each chain.
+    part_grid_.FindPartitionPartners();
+    part_grid_.FindFigureCaptions();
+    part_grid_.RefinePartitionPartners(true);
+    SmoothPartnerRuns();
+
+#ifndef GRAPHICS_DISABLED
+    if (textord_tabfind_show_partitions) {
+      ScrollView *window = MakeWindow(400, 300, "Partitions");
+      if (window != nullptr) {
+        part_grid_.DisplayBoxes(window);
+        if (!textord_debug_printable) {
+          DisplayTabVectors(window);
+        }
+        if (window != nullptr && textord_tabfind_show_partitions > 1) {
+          window->AwaitEvent(SVET_DESTROY);
+        }
+      }
+    }
+#endif // !GRAPHICS_DISABLED
+    part_grid_.AssertNoDuplicates();
+  }
+  // Ownership of the ColPartitions moves from part_sets_ to part_grid_ here,
+  // and ownership of the BLOBNBOXes moves to the ColPartitions.
+  // (They were previously owned by the block or the image_bblobs list.)
+  ReleaseBlobsAndCleanupUnused(input_block);
+  // Ownership of the ColPartitions moves from part_grid_ to good_parts_ and
+  // noise_parts_ here. In text blocks, ownership of the BLOBNBOXes moves
+  // from the ColPartitions to the output TO_BLOCK. In non-text, the
+  // BLOBNBOXes stay with the ColPartitions and get deleted in the destructor.
+  if (PSM_SPARSE(pageseg_mode)) {
+    part_grid_.ExtractPartitionsAsBlocks(blocks, to_blocks);
+  } else {
+    TransformToBlocks(blocks, to_blocks);
+  }
+  if (textord_debug_tabfind) {
+    tprintf("Found %d blocks, %d to_blocks\n", blocks->length(), to_blocks->length());
+  }
+
+#ifndef GRAPHICS_DISABLED
+  if (textord_tabfind_show_blocks) {
+    DisplayBlocks(blocks);
+  }
+#endif
+  RotateAndReskewBlocks(input_is_rtl, to_blocks);
+  int result = 0;
+#ifndef GRAPHICS_DISABLED
+  if (blocks_win_ != nullptr) {
+    bool waiting = false;
+    do {
+      waiting = false;
+      auto event = blocks_win_->AwaitEvent(SVET_ANY);
+      if (event->type == SVET_INPUT && event->parameter != nullptr) {
+        if (*event->parameter == 'd') {
+          result = -1;
+        } else {
+          blocks->clear();
+        }
+      } else if (event->type == SVET_DESTROY) {
+        blocks_win_ = nullptr;
+      } else {
+        waiting = true;
+      }
+    } while (waiting);
+  }
+#endif // !GRAPHICS_DISABLED
+  return result;
+}
+
+// Get the rotation required to deskew, and its inverse rotation.
+void ColumnFinder::GetDeskewVectors(FCOORD *deskew, FCOORD *reskew) {
+  *reskew = reskew_;
+  *deskew = reskew_;
+  deskew->set_y(-deskew->y());
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+void ColumnFinder::SetEquationDetect(EquationDetectBase *detect) {
+  equation_detect_ = detect;
+}
+#endif
+
+//////////////// PRIVATE CODE /////////////////////////
+
+#ifndef GRAPHICS_DISABLED
+
+// Displays the blob and block bounding boxes in a window called Blocks.
+void ColumnFinder::DisplayBlocks(BLOCK_LIST *blocks) {
+  if (blocks_win_ == nullptr) {
+    blocks_win_ = MakeWindow(700, 300, "Blocks");
+  } else {
+    blocks_win_->Clear();
+  }
+  DisplayBoxes(blocks_win_);
+  BLOCK_IT block_it(blocks);
+  int serial = 1;
+  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
+    BLOCK *block = block_it.data();
+    block->pdblk.plot(blocks_win_, serial++,
+                      textord_debug_printable ? ScrollView::BLUE : ScrollView::GREEN);
+  }
+  blocks_win_->Update();
+}
+
+// Displays the column edges at each grid y coordinate defined by
+// best_columns_.
+void ColumnFinder::DisplayColumnBounds(PartSetVector *sets) {
+  ScrollView *col_win = MakeWindow(50, 300, "Columns");
+  DisplayBoxes(col_win);
+  col_win->Pen(textord_debug_printable ? ScrollView::BLUE : ScrollView::GREEN);
+  for (int i = 0; i < gridheight_; ++i) {
+    ColPartitionSet *columns = best_columns_[i];
+    if (columns != nullptr) {
+      columns->DisplayColumnEdges(i * gridsize_, (i + 1) * gridsize_, col_win);
+    }
+  }
+}
+
+#endif // !GRAPHICS_DISABLED
+
+// Sets up column_sets_ (the determined column layout at each horizontal
+// slice). Returns false if the page is empty.
+bool ColumnFinder::MakeColumns(bool single_column) {
+  // The part_sets_ are a temporary structure used during column creation,
+  // and is a vector of ColPartitionSets, representing ColPartitions found
+  // at horizontal slices through the page.
+  PartSetVector part_sets;
+  if (!single_column) {
+    if (!part_grid_.MakeColPartSets(&part_sets)) {
+      return false; // Empty page.
+    }
+    ASSERT_HOST(part_grid_.gridheight() == gridheight_);
+    // Try using only the good parts first.
+    bool good_only = true;
+    do {
+      for (int i = 0; i < gridheight_; ++i) {
+        ColPartitionSet *line_set = part_sets.at(i);
+        if (line_set != nullptr && line_set->LegalColumnCandidate()) {
+          ColPartitionSet *column_candidate = line_set->Copy(good_only);
+          if (column_candidate != nullptr) {
+            column_candidate->AddToColumnSetsIfUnique(&column_sets_, WidthCB());
+          }
+        }
+      }
+      good_only = !good_only;
+    } while (column_sets_.empty() && !good_only);
+    if (textord_debug_tabfind) {
+      PrintColumnCandidates("Column candidates");
+    }
+    // Improve the column candidates against themselves.
+    ImproveColumnCandidates(&column_sets_, &column_sets_);
+    if (textord_debug_tabfind) {
+      PrintColumnCandidates("Improved columns");
+    }
+    // Improve the column candidates using the part_sets_.
+    ImproveColumnCandidates(&part_sets, &column_sets_);
+  }
+  ColPartitionSet *single_column_set = part_grid_.MakeSingleColumnSet(WidthCB());
+  if (single_column_set != nullptr) {
+    // Always add the single column set as a backup even if not in
+    // single column mode.
+    single_column_set->AddToColumnSetsIfUnique(&column_sets_, WidthCB());
+  }
+  if (textord_debug_tabfind) {
+    PrintColumnCandidates("Final Columns");
+  }
+  bool has_columns = !column_sets_.empty();
+  if (has_columns) {
+    // Divide the page into sections of uniform column layout.
+    bool any_multi_column = AssignColumns(part_sets);
+#ifndef GRAPHICS_DISABLED
+    if (textord_tabfind_show_columns) {
+      DisplayColumnBounds(&part_sets);
+    }
+#endif
+    ComputeMeanColumnGap(any_multi_column);
+  }
+  for (auto line_set : part_sets) {
+    if (line_set != nullptr) {
+      line_set->RelinquishParts();
+      delete line_set;
+    }
+  }
+  return has_columns;
+}
+
+// Attempt to improve the column_candidates by expanding the columns
+// and adding new partitions from the partition sets in src_sets.
+// Src_sets may be equal to column_candidates, in which case it will
+// use them as a source to improve themselves.
+void ColumnFinder::ImproveColumnCandidates(PartSetVector *src_sets, PartSetVector *column_sets) {
+  // TODO: optimize.
+  PartSetVector temp_cols = *column_sets;
+  column_sets->clear();
+  if (src_sets == column_sets) {
+    src_sets = &temp_cols;
+  }
+  int set_size = temp_cols.size();
+  // Try using only the good parts first.
+  bool good_only = true;
+  do {
+    for (int i = 0; i < set_size; ++i) {
+      ColPartitionSet *column_candidate = temp_cols.at(i);
+      ASSERT_HOST(column_candidate != nullptr);
+      ColPartitionSet *improved = column_candidate->Copy(good_only);
+      if (improved != nullptr) {
+        improved->ImproveColumnCandidate(WidthCB(), src_sets);
+        improved->AddToColumnSetsIfUnique(column_sets, WidthCB());
+      }
+    }
+    good_only = !good_only;
+  } while (column_sets->empty() && !good_only);
+  if (column_sets->empty()) {
+    // TODO: optimize.
+    *column_sets = temp_cols;
+    temp_cols.clear();
+  } else {
+    for (auto data : temp_cols) {
+      delete data;
+    }
+  }
+}
+
+// Prints debug information on the column candidates.
+void ColumnFinder::PrintColumnCandidates(const char *title) {
+  int set_size = column_sets_.size();
+  tprintf("Found %d %s:\n", set_size, title);
+  if (textord_debug_tabfind >= 3) {
+    for (int i = 0; i < set_size; ++i) {
+      ColPartitionSet *column_set = column_sets_.at(i);
+      column_set->Print();
+    }
+  }
+}
+
+// Finds the optimal set of columns that cover the entire image with as
+// few changes in column partition as possible.
+// NOTE: this could be thought of as an optimization problem, but a simple
+// greedy algorithm is used instead. The algorithm repeatedly finds the modal
+// compatible column in an unassigned region and uses that with the extra
+// tweak of extending the modal region over small breaks in compatibility.
+// Where modal regions overlap, the boundary is chosen so as to minimize
+// the cost in terms of ColPartitions not fitting an approved column.
+// Returns true if any part of the page is multi-column.
+bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) {
+  int set_count = part_sets.size();
+  ASSERT_HOST(set_count == gridheight());
+  // Allocate and init the best_columns_.
+  best_columns_ = new ColPartitionSet *[set_count];
+  for (int y = 0; y < set_count; ++y) {
+    best_columns_[y] = nullptr;
+  }
+  int column_count = column_sets_.size();
+  // column_set_costs[part_sets_ index][column_sets_ index] is
+  // < INT32_MAX if the partition set is compatible with the column set,
+  // in which case its value is the cost for that set used in deciding
+  // which competing set to assign.
+  // any_columns_possible[part_sets_ index] is true if any of
+  // possible_column_sets[part_sets_ index][*] is < INT32_MAX.
+  // assigned_costs[part_sets_ index] is set to the column_set_costs
+  // of the assigned column_sets_ index or INT32_MAX if none is set.
+  // On return the best_columns_ member is set.
+  bool *any_columns_possible = new bool[set_count];
+  int *assigned_costs = new int[set_count];
+  int **column_set_costs = new int *[set_count];
+  // Set possible column_sets to indicate whether each set is compatible
+  // with each column.
+  for (int part_i = 0; part_i < set_count; ++part_i) {
+    ColPartitionSet *line_set = part_sets.at(part_i);
+    bool debug = line_set != nullptr && WithinTestRegion(2, line_set->bounding_box().left(),
+                                                         line_set->bounding_box().bottom());
+    column_set_costs[part_i] = new int[column_count];
+    any_columns_possible[part_i] = false;
+    assigned_costs[part_i] = INT32_MAX;
+    for (int col_i = 0; col_i < column_count; ++col_i) {
+      if (line_set != nullptr &&
+          column_sets_.at(col_i)->CompatibleColumns(debug, line_set, WidthCB())) {
+        column_set_costs[part_i][col_i] = column_sets_.at(col_i)->UnmatchedWidth(line_set);
+        any_columns_possible[part_i] = true;
+      } else {
+        column_set_costs[part_i][col_i] = INT32_MAX;
+        if (debug) {
+          tprintf("Set id %d did not match at y=%d, lineset =%p\n",
+                  col_i, part_i, static_cast<void *>(line_set));
+        }
+      }
+    }
+  }
+  bool any_multi_column = false;
+  // Assign a column set to each vertical grid position.
+  // While there is an unassigned range, find its mode.
+  int start, end;
+  while (BiggestUnassignedRange(set_count, any_columns_possible, &start, &end)) {
+    if (textord_debug_tabfind >= 2) {
+      tprintf("Biggest unassigned range = %d- %d\n", start, end);
+    }
+    // Find the modal column_set_id in the range.
+    int column_set_id = RangeModalColumnSet(column_set_costs, assigned_costs, start, end);
+    if (textord_debug_tabfind >= 2) {
+      tprintf("Range modal column id = %d\n", column_set_id);
+      column_sets_.at(column_set_id)->Print();
+    }
+    // Now find the longest run of the column_set_id in the range.
+    ShrinkRangeToLongestRun(column_set_costs, assigned_costs, any_columns_possible, column_set_id,
+                            &start, &end);
+    if (textord_debug_tabfind >= 2) {
+      tprintf("Shrunk range = %d- %d\n", start, end);
+    }
+    // Extend the start and end past the longest run, while there are
+    // only small gaps in compatibility that can be overcome by larger
+    // regions of compatibility beyond.
+    ExtendRangePastSmallGaps(column_set_costs, assigned_costs, any_columns_possible, column_set_id,
+                             -1, -1, &start);
+    --end;
+    ExtendRangePastSmallGaps(column_set_costs, assigned_costs, any_columns_possible, column_set_id,
+                             1, set_count, &end);
+    ++end;
+    if (textord_debug_tabfind) {
+      tprintf("Column id %d applies to range = %d - %d\n", column_set_id, start, end);
+    }
+    // Assign the column to the range, which now may overlap with other ranges.
+    AssignColumnToRange(column_set_id, start, end, column_set_costs, assigned_costs);
+    if (column_sets_.at(column_set_id)->GoodColumnCount() > 1) {
+      any_multi_column = true;
+    }
+  }
+  // If anything remains unassigned, the whole lot is unassigned, so
+  // arbitrarily assign id 0.
+  if (best_columns_[0] == nullptr) {
+    AssignColumnToRange(0, 0, gridheight_, column_set_costs, assigned_costs);
+  }
+  // Free memory.
+  for (int i = 0; i < set_count; ++i) {
+    delete[] column_set_costs[i];
+  }
+  delete[] assigned_costs;
+  delete[] any_columns_possible;
+  delete[] column_set_costs;
+  return any_multi_column;
+}
+
+// Finds the biggest range in part_sets_ that has no assigned column, but
+// column assignment is possible.
+bool ColumnFinder::BiggestUnassignedRange(int set_count, const bool *any_columns_possible,
+                                          int *best_start, int *best_end) {
+  int best_range_size = 0;
+  *best_start = set_count;
+  *best_end = set_count;
+  int end = set_count;
+  for (int start = 0; start < gridheight_; start = end) {
+    // Find the first unassigned index in start.
+    while (start < set_count) {
+      if (best_columns_[start] == nullptr && any_columns_possible[start]) {
+        break;
+      }
+      ++start;
+    }
+    // Find the first past the end and count the good ones in between.
+    int range_size = 1; // Number of non-null, but unassigned line sets.
+    end = start + 1;
+    while (end < set_count) {
+      if (best_columns_[end] != nullptr) {
+        break;
+      }
+      if (any_columns_possible[end]) {
+        ++range_size;
+      }
+      ++end;
+    }
+    if (start < set_count && range_size > best_range_size) {
+      best_range_size = range_size;
+      *best_start = start;
+      *best_end = end;
+    }
+  }
+  return *best_start < *best_end;
+}
+
+// Finds the modal compatible column_set_ index within the given range.
+int ColumnFinder::RangeModalColumnSet(int **column_set_costs, const int *assigned_costs, int start,
+                                      int end) {
+  int column_count = column_sets_.size();
+  STATS column_stats(0, column_count - 1);
+  for (int part_i = start; part_i < end; ++part_i) {
+    for (int col_j = 0; col_j < column_count; ++col_j) {
+      if (column_set_costs[part_i][col_j] < assigned_costs[part_i]) {
+        column_stats.add(col_j, 1);
+      }
+    }
+  }
+  ASSERT_HOST(column_stats.get_total() > 0);
+  return column_stats.mode();
+}
+
+// Given that there are many column_set_id compatible columns in the range,
+// shrinks the range to the longest contiguous run of compatibility, allowing
+// gaps where no columns are possible, but not where competing columns are
+// possible.
+void ColumnFinder::ShrinkRangeToLongestRun(int **column_set_costs, const int *assigned_costs,
+                                           const bool *any_columns_possible, int column_set_id,
+                                           int *best_start, int *best_end) {
+  // orig_start and orig_end are the maximum range we will look at.
+  int orig_start = *best_start;
+  int orig_end = *best_end;
+  int best_range_size = 0;
+  *best_start = orig_end;
+  *best_end = orig_end;
+  int end = orig_end;
+  for (int start = orig_start; start < orig_end; start = end) {
+    // Find the first possible
+    while (start < orig_end) {
+      if (column_set_costs[start][column_set_id] < assigned_costs[start] ||
+          !any_columns_possible[start]) {
+        break;
+      }
+      ++start;
+    }
+    // Find the first past the end.
+    end = start + 1;
+    while (end < orig_end) {
+      if (column_set_costs[end][column_set_id] >= assigned_costs[start] &&
+          any_columns_possible[end]) {
+        break;
+      }
+      ++end;
+    }
+    if (start < orig_end && end - start > best_range_size) {
+      best_range_size = end - start;
+      *best_start = start;
+      *best_end = end;
+    }
+  }
+}
+
+// Moves start in the direction of step, up to, but not including end while
+// the only incompatible regions are no more than kMaxIncompatibleColumnCount
+// in size, and the compatible regions beyond are bigger.
+void ColumnFinder::ExtendRangePastSmallGaps(int **column_set_costs, const int *assigned_costs,
+                                            const bool *any_columns_possible, int column_set_id,
+                                            int step, int end, int *start) {
+  if (textord_debug_tabfind > 2) {
+    tprintf("Starting expansion at %d, step=%d, limit=%d\n", *start, step, end);
+  }
+  if (*start == end) {
+    return; // Cannot be expanded.
+  }
+
+  int barrier_size = 0;
+  int good_size = 0;
+  do {
+    // Find the size of the incompatible barrier.
+    barrier_size = 0;
+    int i;
+    for (i = *start + step; i != end; i += step) {
+      if (column_set_costs[i][column_set_id] < assigned_costs[i]) {
+        break; // We are back on.
+      }
+      // Locations where none are possible don't count.
+      if (any_columns_possible[i]) {
+        ++barrier_size;
+      }
+    }
+    if (textord_debug_tabfind > 2) {
+      tprintf("At %d, Barrier size=%d\n", i, barrier_size);
+    }
+    if (barrier_size > kMaxIncompatibleColumnCount) {
+      return; // Barrier too big.
+    }
+    if (i == end) {
+      // We can't go any further, but the barrier was small, so go to the end.
+      *start = i - step;
+      return;
+    }
+    // Now find the size of the good region on the other side.
+    good_size = 1;
+    for (i += step; i != end; i += step) {
+      if (column_set_costs[i][column_set_id] < assigned_costs[i]) {
+        ++good_size;
+      } else if (any_columns_possible[i]) {
+        break;
+      }
+    }
+    if (textord_debug_tabfind > 2) {
+      tprintf("At %d, good size = %d\n", i, good_size);
+    }
+    // If we had enough good ones we can extend the start and keep looking.
+    if (good_size >= barrier_size) {
+      *start = i - step;
+    }
+  } while (good_size >= barrier_size);
+}
+
+// Assigns the given column_set_id to the given range.
+void ColumnFinder::AssignColumnToRange(int column_set_id, int start, int end,
+                                       int **column_set_costs, int *assigned_costs) {
+  ColPartitionSet *column_set = column_sets_.at(column_set_id);
+  for (int i = start; i < end; ++i) {
+    assigned_costs[i] = column_set_costs[i][column_set_id];
+    best_columns_[i] = column_set;
+  }
+}
+
+// Computes the mean_column_gap_.
+void ColumnFinder::ComputeMeanColumnGap(bool any_multi_column) {
+  int total_gap = 0;
+  int total_width = 0;
+  int gap_samples = 0;
+  int width_samples = 0;
+  for (int i = 0; i < gridheight_; ++i) {
+    ASSERT_HOST(best_columns_[i] != nullptr);
+    best_columns_[i]->AccumulateColumnWidthsAndGaps(&total_width, &width_samples, &total_gap,
+                                                    &gap_samples);
+  }
+  mean_column_gap_ = any_multi_column && gap_samples > 0
+                         ? total_gap / gap_samples
+                         : width_samples > 0 ? total_width / width_samples : 0;
+}
+
+//////// Functions that manipulate ColPartitions in the part_grid_ /////
+//////// to split, merge, find margins, and find types.  //////////////
+
+// Helper to delete all the deletable blobs on the list. Owned blobs are
+// extracted from the list, but not deleted, leaving them owned by the owner().
+static void ReleaseAllBlobsAndDeleteUnused(BLOBNBOX_LIST *blobs) {
+  for (BLOBNBOX_IT blob_it(blobs); !blob_it.empty(); blob_it.forward()) {
+    BLOBNBOX *blob = blob_it.extract();
+    if (blob->owner() == nullptr) {
+      delete blob;
+    }
+  }
+}
+
+// Hoovers up all un-owned blobs and deletes them.
+// The rest get released from the block so the ColPartitions can pass
+// ownership to the output blocks.
+void ColumnFinder::ReleaseBlobsAndCleanupUnused(TO_BLOCK *block) {
+  ReleaseAllBlobsAndDeleteUnused(&block->blobs);
+  ReleaseAllBlobsAndDeleteUnused(&block->small_blobs);
+  ReleaseAllBlobsAndDeleteUnused(&block->noise_blobs);
+  ReleaseAllBlobsAndDeleteUnused(&block->large_blobs);
+  ReleaseAllBlobsAndDeleteUnused(&image_bblobs_);
+}
+
+// Splits partitions that cross columns where they have nothing in the gap.
+void ColumnFinder::GridSplitPartitions() {
+  // Iterate the ColPartitions in the grid.
+  ColPartitionGridSearch gsearch(&part_grid_);
+  gsearch.StartFullSearch();
+  ColPartition *dont_repeat = nullptr;
+  ColPartition *part;
+  while ((part = gsearch.NextFullSearch()) != nullptr) {
+    if (part->blob_type() < BRT_UNKNOWN || part == dont_repeat) {
+      continue; // Only applies to text partitions.
+    }
+    ColPartitionSet *column_set = best_columns_[gsearch.GridY()];
+    int first_col = -1;
+    int last_col = -1;
+    // Find which columns the partition spans.
+    part->ColumnRange(resolution_, column_set, &first_col, &last_col);
+    if (first_col > 0) {
+      --first_col;
+    }
+    // Convert output column indices to physical column indices.
+    first_col /= 2;
+    last_col /= 2;
+    // We will only consider cases where a partition spans two columns,
+    // since a heading that spans more columns than that is most likely
+    // genuine.
+    if (last_col != first_col + 1) {
+      continue;
+    }
+    // Set up a rectangle search x-bounded by the column gap and y by the part.
+    int y = part->MidY();
+    TBOX margin_box = part->bounding_box();
+    bool debug = AlignedBlob::WithinTestRegion(2, margin_box.left(), margin_box.bottom());
+    if (debug) {
+      tprintf("Considering partition for GridSplit:");
+      part->Print();
+    }
+    ColPartition *column = column_set->GetColumnByIndex(first_col);
+    if (column == nullptr) {
+      continue;
+    }
+    margin_box.set_left(column->RightAtY(y) + 2);
+    column = column_set->GetColumnByIndex(last_col);
+    if (column == nullptr) {
+      continue;
+    }
+    margin_box.set_right(column->LeftAtY(y) - 2);
+    // TODO(rays) Decide whether to keep rectangular filling or not in the
+    // main grid and therefore whether we need a fancier search here.
+    // Now run the rect search on the main blob grid.
+    GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> rectsearch(this);
+    if (debug) {
+      tprintf("Searching box (%d,%d)->(%d,%d)\n", margin_box.left(), margin_box.bottom(),
+              margin_box.right(), margin_box.top());
+      part->Print();
+    }
+    rectsearch.StartRectSearch(margin_box);
+    BLOBNBOX *bbox;
+    while ((bbox = rectsearch.NextRectSearch()) != nullptr) {
+      if (bbox->bounding_box().overlap(margin_box)) {
+        break;
+      }
+    }
+    if (bbox == nullptr) {
+      // There seems to be nothing in the hole, so split the partition.
+      gsearch.RemoveBBox();
+      int x_middle = (margin_box.left() + margin_box.right()) / 2;
+      if (debug) {
+        tprintf("Splitting part at %d:", x_middle);
+        part->Print();
+      }
+      ColPartition *split_part = part->SplitAt(x_middle);
+      if (split_part != nullptr) {
+        if (debug) {
+          tprintf("Split result:");
+          part->Print();
+          split_part->Print();
+        }
+        part_grid_.InsertBBox(true, true, split_part);
+      } else {
+        // Split had no effect
+        if (debug) {
+          tprintf("Split had no effect\n");
+        }
+        dont_repeat = part;
+      }
+      part_grid_.InsertBBox(true, true, part);
+      gsearch.RepositionIterator();
+    } else if (debug) {
+      tprintf("Part cannot be split: blob (%d,%d)->(%d,%d) in column gap\n",
+              bbox->bounding_box().left(), bbox->bounding_box().bottom(),
+              bbox->bounding_box().right(), bbox->bounding_box().top());
+    }
+  }
+}
+
+// Merges partitions where there is vertical overlap, within a single column,
+// and the horizontal gap is small enough.
+void ColumnFinder::GridMergePartitions() {
+  // Iterate the ColPartitions in the grid.
+  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(&part_grid_);
+  gsearch.StartFullSearch();
+  ColPartition *part;
+  while ((part = gsearch.NextFullSearch()) != nullptr) {
+    if (part->IsUnMergeableType()) {
+      continue;
+    }
+    // Set up a rectangle search x-bounded by the column and y by the part.
+    ColPartitionSet *columns = best_columns_[gsearch.GridY()];
+    TBOX box = part->bounding_box();
+    bool debug = AlignedBlob::WithinTestRegion(1, box.left(), box.bottom());
+    if (debug) {
+      tprintf("Considering part for merge at:");
+      part->Print();
+    }
+    int y = part->MidY();
+    ColPartition *left_column = columns->ColumnContaining(box.left(), y);
+    ColPartition *right_column = columns->ColumnContaining(box.right(), y);
+    if (left_column == nullptr || right_column != left_column) {
+      if (debug) {
+        tprintf("In different columns\n");
+      }
+      continue;
+    }
+    box.set_left(left_column->LeftAtY(y));
+    box.set_right(right_column->RightAtY(y));
+    // Now run the rect search.
+    bool modified_box = false;
+    GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rsearch(&part_grid_);
+    rsearch.SetUniqueMode(true);
+    rsearch.StartRectSearch(box);
+    ColPartition *neighbour;
+
+    while ((neighbour = rsearch.NextRectSearch()) != nullptr) {
+      if (neighbour == part || neighbour->IsUnMergeableType()) {
+        continue;
+      }
+      const TBOX &neighbour_box = neighbour->bounding_box();
+      if (debug) {
+        tprintf("Considering merge with neighbour at:");
+        neighbour->Print();
+      }
+      if (neighbour_box.right() < box.left() || neighbour_box.left() > box.right()) {
+        continue; // Not within the same column.
+      }
+      if (part->VSignificantCoreOverlap(*neighbour) && part->TypesMatch(*neighbour)) {
+        // There is vertical overlap and the gross types match, but only
+        // merge if the horizontal gap is small enough, as one of the
+        // partitions may be a figure caption within a column.
+        // If there is only one column, then the mean_column_gap_ is large
+        // enough to allow almost any merge, by being the mean column width.
+        const TBOX &part_box = part->bounding_box();
+        // Don't merge if there is something else in the way. Use the margin
+        // to decide, and check both to allow a bit of overlap.
+        if (neighbour_box.left() > part->right_margin() &&
+            part_box.right() < neighbour->left_margin()) {
+          continue; // Neighbour is too far to the right.
+        }
+        if (neighbour_box.right() < part->left_margin() &&
+            part_box.left() > neighbour->right_margin()) {
+          continue; // Neighbour is too far to the left.
+        }
+        int h_gap = std::max(part_box.left(), neighbour_box.left()) -
+                    std::min(part_box.right(), neighbour_box.right());
+        if (h_gap < mean_column_gap_ * kHorizontalGapMergeFraction ||
+            part_box.width() < mean_column_gap_ || neighbour_box.width() < mean_column_gap_) {
+          if (debug) {
+            tprintf("Running grid-based merge between:\n");
+            part->Print();
+            neighbour->Print();
+          }
+          rsearch.RemoveBBox();
+          if (!modified_box) {
+            // We are going to modify part, so remove it and re-insert it after.
+            gsearch.RemoveBBox();
+            rsearch.RepositionIterator();
+            modified_box = true;
+          }
+          part->Absorb(neighbour, WidthCB());
+        } else if (debug) {
+          tprintf("Neighbour failed hgap test\n");
+        }
+      } else if (debug) {
+        tprintf("Neighbour failed overlap or typesmatch test\n");
+      }
+    }
+    if (modified_box) {
+      // We modified the box of part, so re-insert it into the grid.
+      // This does no harm in the current cell, as it already exists there,
+      // but it needs to exist in all the cells covered by its bounding box,
+      // or it will never be found by a full search.
+      // Because the box has changed, it has to be removed first, otherwise
+      // add_sorted may fail to keep a single copy of the pointer.
+      part_grid_.InsertBBox(true, true, part);
+      gsearch.RepositionIterator();
+    }
+  }
+}
+
+// Inserts remaining noise blobs into the most applicable partition if any.
+// If there is no applicable partition, then the blobs are deleted.
+void ColumnFinder::InsertRemainingNoise(TO_BLOCK *block) {
+  BLOBNBOX_IT blob_it(&block->noise_blobs);
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    BLOBNBOX *blob = blob_it.data();
+    if (blob->owner() != nullptr) {
+      continue;
+    }
+    TBOX search_box(blob->bounding_box());
+    bool debug = WithinTestRegion(2, search_box.left(), search_box.bottom());
+    search_box.pad(gridsize(), gridsize());
+    // Setup a rectangle search to find the best partition to merge with.
+    ColPartitionGridSearch rsearch(&part_grid_);
+    rsearch.SetUniqueMode(true);
+    rsearch.StartRectSearch(search_box);
+    ColPartition *part;
+    ColPartition *best_part = nullptr;
+    int best_distance = 0;
+    while ((part = rsearch.NextRectSearch()) != nullptr) {
+      if (part->IsUnMergeableType()) {
+        continue;
+      }
+      int distance =
+          projection_.DistanceOfBoxFromPartition(blob->bounding_box(), *part, denorm_, debug);
+      if (best_part == nullptr || distance < best_distance) {
+        best_part = part;
+        best_distance = distance;
+      }
+    }
+    if (best_part != nullptr &&
+        best_distance < kMaxDistToPartSizeRatio * best_part->median_height()) {
+      // Close enough to merge.
+      if (debug) {
+        tprintf("Adding noise blob with distance %d, thr=%g:box:", best_distance,
+                kMaxDistToPartSizeRatio * best_part->median_height());
+        blob->bounding_box().print();
+        tprintf("To partition:");
+        best_part->Print();
+      }
+      part_grid_.RemoveBBox(best_part);
+      best_part->AddBox(blob);
+      part_grid_.InsertBBox(true, true, best_part);
+      blob->set_owner(best_part);
+      blob->set_flow(best_part->flow());
+      blob->set_region_type(best_part->blob_type());
+    } else {
+      // Mark the blob for deletion.
+      blob->set_region_type(BRT_NOISE);
+    }
+  }
+  // Delete the marked blobs, clearing neighbour references.
+  block->DeleteUnownedNoise();
+}
+
+// Helper makes a box from a horizontal line.
+static TBOX BoxFromHLine(const TabVector *hline) {
+  int top = std::max(hline->startpt().y(), hline->endpt().y());
+  int bottom = std::min(hline->startpt().y(), hline->endpt().y());
+  top += hline->mean_width();
+  if (top == bottom) {
+    if (bottom > 0) {
+      --bottom;
+    } else {
+      ++top;
+    }
+  }
+  return TBOX(hline->startpt().x(), bottom, hline->endpt().x(), top);
+}
+
+// Remove partitions that come from horizontal lines that look like
+// underlines, but are not part of a table.
+void ColumnFinder::GridRemoveUnderlinePartitions() {
+  TabVector_IT hline_it(&horizontal_lines_);
+  for (hline_it.mark_cycle_pt(); !hline_it.cycled_list(); hline_it.forward()) {
+    TabVector *hline = hline_it.data();
+    if (hline->intersects_other_lines()) {
+      continue;
+    }
+    TBOX line_box = BoxFromHLine(hline);
+    TBOX search_box = line_box;
+    search_box.pad(0, line_box.height());
+    ColPartitionGridSearch part_search(&part_grid_);
+    part_search.SetUniqueMode(true);
+    part_search.StartRectSearch(search_box);
+    ColPartition *covered;
+    bool touched_table = false;
+    bool touched_text = false;
+    ColPartition *line_part = nullptr;
+    while ((covered = part_search.NextRectSearch()) != nullptr) {
+      if (covered->type() == PT_TABLE) {
+        touched_table = true;
+        break;
+      } else if (covered->IsTextType()) {
+        // TODO(rays) Add a list of underline sections to ColPartition.
+        int text_bottom = covered->median_bottom();
+        if (line_box.bottom() <= text_bottom && text_bottom <= search_box.top()) {
+          touched_text = true;
+        }
+      } else if (covered->blob_type() == BRT_HLINE && line_box.contains(covered->bounding_box()) &&
+                 // not if same instance (identical to hline)
+                 !TBOX(covered->bounding_box()).contains(line_box)) {
+        line_part = covered;
+      }
+    }
+    if (line_part != nullptr && !touched_table && touched_text) {
+      part_grid_.RemoveBBox(line_part);
+      delete line_part;
+    }
+  }
+}
+
+// Add horizontal line separators as partitions.
+void ColumnFinder::GridInsertHLinePartitions() {
+  TabVector_IT hline_it(&horizontal_lines_);
+  for (hline_it.mark_cycle_pt(); !hline_it.cycled_list(); hline_it.forward()) {
+    TabVector *hline = hline_it.data();
+    TBOX line_box = BoxFromHLine(hline);
+    ColPartition *part =
+        ColPartition::MakeLinePartition(BRT_HLINE, vertical_skew_, line_box.left(),
+                                        line_box.bottom(), line_box.right(), line_box.top());
+    part->set_type(PT_HORZ_LINE);
+    bool any_image = false;
+    ColPartitionGridSearch part_search(&part_grid_);
+    part_search.SetUniqueMode(true);
+    part_search.StartRectSearch(line_box);
+    ColPartition *covered;
+    while ((covered = part_search.NextRectSearch()) != nullptr) {
+      if (covered->IsImageType()) {
+        any_image = true;
+        break;
+      }
+    }
+    if (!any_image) {
+      part_grid_.InsertBBox(true, true, part);
+    } else {
+      delete part;
+    }
+  }
+}
+
+// Add horizontal line separators as partitions.
+void ColumnFinder::GridInsertVLinePartitions() {
+  TabVector_IT vline_it(dead_vectors());
+  for (vline_it.mark_cycle_pt(); !vline_it.cycled_list(); vline_it.forward()) {
+    TabVector *vline = vline_it.data();
+    if (!vline->IsSeparator()) {
+      continue;
+    }
+    int left = std::min(vline->startpt().x(), vline->endpt().x());
+    int right = std::max(vline->startpt().x(), vline->endpt().x());
+    right += vline->mean_width();
+    if (left == right) {
+      if (left > 0) {
+        --left;
+      } else {
+        ++right;
+      }
+    }
+    ColPartition *part = ColPartition::MakeLinePartition(
+        BRT_VLINE, vertical_skew_, left, vline->startpt().y(), right, vline->endpt().y());
+    part->set_type(PT_VERT_LINE);
+    bool any_image = false;
+    ColPartitionGridSearch part_search(&part_grid_);
+    part_search.SetUniqueMode(true);
+    part_search.StartRectSearch(part->bounding_box());
+    ColPartition *covered;
+    while ((covered = part_search.NextRectSearch()) != nullptr) {
+      if (covered->IsImageType()) {
+        any_image = true;
+        break;
+      }
+    }
+    if (!any_image) {
+      part_grid_.InsertBBox(true, true, part);
+    } else {
+      delete part;
+    }
+  }
+}
+
+// For every ColPartition in the grid, sets its type based on position
+// in the columns.
+void ColumnFinder::SetPartitionTypes() {
+  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(&part_grid_);
+  gsearch.StartFullSearch();
+  ColPartition *part;
+  while ((part = gsearch.NextFullSearch()) != nullptr) {
+    part->SetPartitionType(resolution_, best_columns_[gsearch.GridY()]);
+  }
+}
+
+// Only images remain with multiple types in a run of partners.
+// Sets the type of all in the group to the maximum of the group.
+void ColumnFinder::SmoothPartnerRuns() {
+  // Iterate the ColPartitions in the grid.
+  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(&part_grid_);
+  gsearch.StartFullSearch();
+  ColPartition *part;
+  while ((part = gsearch.NextFullSearch()) != nullptr) {
+    ColPartition *partner = part->SingletonPartner(true);
+    if (partner != nullptr) {
+      if (partner->SingletonPartner(false) != part) {
+        tprintf("Ooops! Partition:(%d partners)", part->upper_partners()->length());
+        part->Print();
+        tprintf("has singleton partner:(%d partners", partner->lower_partners()->length());
+        partner->Print();
+        tprintf("but its singleton partner is:");
+        if (partner->SingletonPartner(false) == nullptr) {
+          tprintf("NULL\n");
+        } else {
+          partner->SingletonPartner(false)->Print();
+        }
+      }
+      ASSERT_HOST(partner->SingletonPartner(false) == part);
+    } else if (part->SingletonPartner(false) != nullptr) {
+      ColPartitionSet *column_set = best_columns_[gsearch.GridY()];
+      int column_count = column_set->ColumnCount();
+      part->SmoothPartnerRun(column_count * 2 + 1);
+    }
+  }
+}
+
+// Helper functions for TransformToBlocks.
+// Add the part to the temp list in the correct order.
+void ColumnFinder::AddToTempPartList(ColPartition *part, ColPartition_CLIST *temp_list) {
+  int mid_y = part->MidY();
+  ColPartition_C_IT it(temp_list);
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    ColPartition *test_part = it.data();
+    if (part->type() == PT_NOISE || test_part->type() == PT_NOISE) {
+      continue; // Noise stays in sequence.
+    }
+    if (test_part == part->SingletonPartner(false)) {
+      break; // Insert before its lower partner.
+    }
+    int neighbour_bottom = test_part->median_bottom();
+    int neighbour_top = test_part->median_top();
+    int neighbour_y = (neighbour_bottom + neighbour_top) / 2;
+    if (neighbour_y < mid_y) {
+      break; // part is above test_part so insert it.
+    }
+    if (!part->HOverlaps(*test_part) && !part->WithinSameMargins(*test_part)) {
+      continue; // Incompatibles stay in order
+    }
+  }
+  if (it.cycled_list()) {
+    it.add_to_end(part);
+  } else {
+    it.add_before_stay_put(part);
+  }
+}
+
+// Add everything from the temp list to the work_set assuming correct order.
+void ColumnFinder::EmptyTempPartList(ColPartition_CLIST *temp_list, WorkingPartSet_LIST *work_set) {
+  ColPartition_C_IT it(temp_list);
+  while (!it.empty()) {
+    it.extract()->AddToWorkingSet(bleft_, tright_, resolution_, &good_parts_, work_set);
+    it.forward();
+  }
+}
+
+// Transform the grid of partitions to the output blocks.
+void ColumnFinder::TransformToBlocks(BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) {
+  WorkingPartSet_LIST work_set;
+  ColPartitionSet *column_set = nullptr;
+  ColPartition_IT noise_it(&noise_parts_);
+  // The temp_part_list holds a list of parts at the same grid y coord
+  // so they can be added in the correct order. This prevents thin objects
+  // like horizontal lines going before the text lines above them.
+  ColPartition_CLIST temp_part_list;
+  // Iterate the ColPartitions in the grid. It starts at the top
+  ColPartitionGridSearch gsearch(&part_grid_);
+  gsearch.StartFullSearch();
+  int prev_grid_y = -1;
+  ColPartition *part;
+  while ((part = gsearch.NextFullSearch()) != nullptr) {
+    int grid_y = gsearch.GridY();
+    if (grid_y != prev_grid_y) {
+      EmptyTempPartList(&temp_part_list, &work_set);
+      prev_grid_y = grid_y;
+    }
+    if (best_columns_[grid_y] != column_set) {
+      column_set = best_columns_[grid_y];
+      // Every line should have a non-null best column.
+      ASSERT_HOST(column_set != nullptr);
+      column_set->ChangeWorkColumns(bleft_, tright_, resolution_, &good_parts_, &work_set);
+      if (textord_debug_tabfind) {
+        tprintf("Changed column groups at grid index %d, y=%d\n", gsearch.GridY(),
+                gsearch.GridY() * gridsize());
+      }
+    }
+    if (part->type() == PT_NOISE) {
+      noise_it.add_to_end(part);
+    } else {
+      AddToTempPartList(part, &temp_part_list);
+    }
+  }
+  EmptyTempPartList(&temp_part_list, &work_set);
+  // Now finish all working sets and transfer ColPartitionSets to block_sets.
+  WorkingPartSet_IT work_it(&work_set);
+  while (!work_it.empty()) {
+    WorkingPartSet *working_set = work_it.extract();
+    working_set->ExtractCompletedBlocks(bleft_, tright_, resolution_, &good_parts_, blocks,
+                                        to_blocks);
+    delete working_set;
+    work_it.forward();
+  }
+}
+
+// Helper reflects a list of blobs in the y-axis.
+// Only reflects the BLOBNBOX bounding box. Not the blobs or outlines below.
+static void ReflectBlobList(BLOBNBOX_LIST *bblobs) {
+  BLOBNBOX_IT it(bblobs);
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    it.data()->reflect_box_in_y_axis();
+  }
+}
+
+// Reflect the blob boxes (but not the outlines) in the y-axis so that
+// the blocks get created in the correct RTL order. Reflects the blobs
+// in the input_block and the bblobs list.
+// The reflection is undone in RotateAndReskewBlocks by
+// reflecting the blocks themselves, and then recomputing the blob bounding
+// boxes.
+void ColumnFinder::ReflectForRtl(TO_BLOCK *input_block, BLOBNBOX_LIST *bblobs) {
+  ReflectBlobList(bblobs);
+  ReflectBlobList(&input_block->blobs);
+  ReflectBlobList(&input_block->small_blobs);
+  ReflectBlobList(&input_block->noise_blobs);
+  ReflectBlobList(&input_block->large_blobs);
+  // Update the denorm with the reflection.
+  auto *new_denorm = new DENORM;
+  new_denorm->SetupNormalization(nullptr, nullptr, denorm_, 0.0f, 0.0f, -1.0f, 1.0f, 0.0f, 0.0f);
+  denorm_ = new_denorm;
+}
+
+// Helper fixes up blobs and cblobs to match the desired rotation,
+// exploding multi-outline blobs back to single blobs and accumulating
+// the bounding box widths and heights.
+static void RotateAndExplodeBlobList(const FCOORD &blob_rotation, BLOBNBOX_LIST *bblobs,
+                                     STATS *widths, STATS *heights) {
+  BLOBNBOX_IT it(bblobs);
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    BLOBNBOX *blob = it.data();
+    C_BLOB *cblob = blob->cblob();
+    C_OUTLINE_LIST *outlines = cblob->out_list();
+    C_OUTLINE_IT ol_it(outlines);
+    if (!outlines->singleton()) {
+      // This blob has multiple outlines from CJK repair.
+      // Explode the blob back into individual outlines.
+      for (; !ol_it.empty(); ol_it.forward()) {
+        C_OUTLINE *outline = ol_it.extract();
+        BLOBNBOX *new_blob = BLOBNBOX::RealBlob(outline);
+        // This blob will be revisited later since we add_after_stay_put here.
+        // This means it will get rotated and have its width/height added to
+        // the stats below.
+        it.add_after_stay_put(new_blob);
+      }
+      it.extract();
+      delete blob;
+    } else {
+      if (blob_rotation.x() != 1.0f || blob_rotation.y() != 0.0f) {
+        cblob->rotate(blob_rotation);
+      }
+      blob->compute_bounding_box();
+      widths->add(blob->bounding_box().width(), 1);
+      heights->add(blob->bounding_box().height(), 1);
+    }
+  }
+}
+
+// Undo the deskew that was done in FindTabVectors, as recognition is done
+// without correcting blobs or blob outlines for skew.
+// Reskew the completed blocks to put them back to the original rotated coords
+// that were created by CorrectOrientation.
+// If the input_is_rtl, then reflect the blocks in the y-axis to undo the
+// reflection that was done before FindTabVectors.
+// Blocks that were identified as vertical text (relative to the rotated
+// coordinates) are further rotated so the text lines are horizontal.
+// blob polygonal outlines are rotated to match the position of the blocks
+// that they are in, and their bounding boxes are recalculated to be accurate.
+// Record appropriate inverse transformations and required
+// classifier transformation in the blocks.
+void ColumnFinder::RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST *blocks) {
+  if (input_is_rtl) {
+    // The skew is backwards because of the reflection.
+    FCOORD tmp = deskew_;
+    deskew_ = reskew_;
+    reskew_ = tmp;
+  }
+  TO_BLOCK_IT it(blocks);
+  int block_index = 1;
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    TO_BLOCK *to_block = it.data();
+    BLOCK *block = to_block->block;
+    // Blocks are created on the deskewed blob outlines in TransformToBlocks()
+    // so we need to reskew them back to page coordinates.
+    if (input_is_rtl) {
+      block->reflect_polygon_in_y_axis();
+    }
+    block->rotate(reskew_);
+    // Copy the right_to_left flag to the created block.
+    block->set_right_to_left(input_is_rtl);
+    // Save the skew angle in the block for baseline computations.
+    block->set_skew(reskew_);
+    block->pdblk.set_index(block_index++);
+    FCOORD blob_rotation = ComputeBlockAndClassifyRotation(block);
+    // Rotate all the blobs if needed and recompute the bounding boxes.
+    // Compute the block median blob width and height as we go.
+    STATS widths(0, block->pdblk.bounding_box().width() - 1);
+    STATS heights(0, block->pdblk.bounding_box().height() - 1);
+    RotateAndExplodeBlobList(blob_rotation, &to_block->blobs, &widths, &heights);
+    TO_ROW_IT row_it(to_block->get_rows());
+    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+      TO_ROW *row = row_it.data();
+      RotateAndExplodeBlobList(blob_rotation, row->blob_list(), &widths, &heights);
+    }
+    block->set_median_size(static_cast<int>(widths.median() + 0.5),
+                           static_cast<int>(heights.median() + 0.5));
+    if (textord_debug_tabfind >= 2) {
+      tprintf("Block median size = (%d, %d)\n", block->median_size().x(), block->median_size().y());
+    }
+  }
+}
+
+// Computes the rotations for the block (to make textlines horizontal) and
+// for the blobs (for classification) and sets the appropriate members
+// of the given block.
+// Returns the rotation that needs to be applied to the blobs to make
+// them sit in the rotated block.
+FCOORD ColumnFinder::ComputeBlockAndClassifyRotation(BLOCK *block) {
+  // The text_rotation_ tells us the gross page text rotation that needs
+  // to be applied for classification
+  // TODO(rays) find block-level classify rotation by orientation detection.
+  // In the mean time, assume that "up" for text printed in the minority
+  // direction (PT_VERTICAL_TEXT) is perpendicular to the line of reading.
+  // Accomplish this by zero-ing out the text rotation.  This covers the
+  // common cases of image credits in documents written in Latin scripts
+  // and page headings for predominantly vertically written CJK books.
+  FCOORD classify_rotation(text_rotation_);
+  FCOORD block_rotation(1.0f, 0.0f);
+  if (block->pdblk.poly_block()->isA() == PT_VERTICAL_TEXT) {
+    // Vertical text needs to be 90 degrees rotated relative to the rest.
+    // If the rest has a 90 degree rotation already, use the inverse, making
+    // the vertical text the original way up. Otherwise use 90 degrees
+    // clockwise.
+    if (rerotate_.x() == 0.0f) {
+      block_rotation = rerotate_;
+    } else {
+      block_rotation = FCOORD(0.0f, -1.0f);
+    }
+    block->rotate(block_rotation);
+    classify_rotation = FCOORD(1.0f, 0.0f);
+  }
+  block_rotation.rotate(rotation_);
+  // block_rotation is now what we have done to the blocks. Now do the same
+  // thing to the blobs, but save the inverse rotation in the block, as that
+  // is what we need to DENORM back to the image coordinates.
+  FCOORD blob_rotation(block_rotation);
+  block_rotation.set_y(-block_rotation.y());
+  block->set_re_rotation(block_rotation);
+  block->set_classify_rotation(classify_rotation);
+  if (textord_debug_tabfind) {
+    tprintf("Blk %d, type %d rerotation(%.2f, %.2f), char(%.2f,%.2f), box:", block->pdblk.index(),
+            block->pdblk.poly_block()->isA(), block->re_rotation().x(), block->re_rotation().y(),
+            classify_rotation.x(), classify_rotation.y());
+    block->pdblk.bounding_box().print();
+  }
+  return blob_rotation;
+}
+
+} // namespace tesseract.