Mercurial > hgrepos > Python2 > PyMuPDF

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**********************************************************************
 * tospace.cpp
 *
 * Compute fuzzy word spacing thresholds for each row.
 * I.e. set :   max_nonspace
 *              space_threshold
 *              min_space
 *              kern_size
 *              space_size
 * for each row.
 * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
 *
 * Note: functions in this file were originally not members of any
 * class or enclosed by any namespace. Now they are all static members
 * of the Textord class.
 *
 **********************************************************************/

#include "drawtord.h"
#include "statistc.h"
#include "textord.h"
#include "tovars.h"

// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#  include "config_auto.h"
#endif

#include <algorithm>
#include <cmath>
#include <memory>

#define MAXSPACING 128 /*max expected spacing in pix */

namespace tesseract {
// Compute spacing thresholds for every row of every block on the page.
// For each block, block-level gap estimates are obtained from
// block_spacing_stats(); each row whose pitch decision is proportional is
// then handed to row_spacing_stats(). Rows with any other pitch decision
// are left untouched here.
// page_tr is not used by this function.
void Textord::to_spacing(ICOORD page_tr,       // topright of page
                         TO_BLOCK_LIST *blocks // blocks on page
) {
  TO_BLOCK_IT blk_it;
  blk_it.set_to_list(blocks);

  // Block and row numbers are 1-based; they are used in debug output and
  // forwarded to row_spacing_stats().
  int block_no = 1;
  for (blk_it.mark_cycle_pt(); !blk_it.cycled_list(); blk_it.forward(), ++block_no) {
    TO_BLOCK *cur_block = blk_it.data();
    // Map of big vertical gaps in this block; released at the end of the
    // iteration.
    std::unique_ptr<GAPMAP> big_gaps(new GAPMAP(cur_block));

    int16_t space_gap = 0;         // block estimate of real space width
    int16_t kern_gap = 0;          // block estimate of non-space gap width
    bool was_proportional = false; // old fixed/prop result (debug only)
    block_spacing_stats(cur_block, big_gaps.get(), was_proportional, space_gap, kern_gap);

    // Keep the block-level non-space gap at no more than a third of the
    // space gap when the old method with constraints is selected. The same
    // 1:3 ratio is applied inside block_spacing_stats() to the space width.
    // Useful for arabic and hindi, where the non-space gap width is often
    // over-estimated and should not be trusted.
    if (tosp_old_to_method && tosp_old_to_constrain_sp_kn && kern_gap > space_gap / 3) {
      kern_gap = space_gap / 3;
    }

    TO_ROW_IT rows(cur_block->get_rows());
    int row_no = 1;
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward(), ++row_no) {
      TO_ROW *cur_row = rows.data();
      const bool proportional = (cur_row->pitch_decision == PITCH_DEF_PROP) ||
                                (cur_row->pitch_decision == PITCH_CORR_PROP);
      if (proportional) {
        if ((tosp_debug_level > 0) && !was_proportional) {
          tprintf("Block %d Row %d: Now Proportional\n", block_no, row_no);
        }
        row_spacing_stats(cur_row, big_gaps.get(), block_no, row_no, space_gap, kern_gap);
      } else if ((tosp_debug_level > 0) && was_proportional) {
        tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_no, row_no,
                cur_row->pitch_decision, cur_row->fixed_pitch);
      }
#ifndef GRAPHICS_DISABLED
      if (textord_show_initial_words) {
        plot_word_decisions(to_win, static_cast<int16_t>(cur_row->fixed_pitch), cur_row);
      }
#endif
    }
  }
}

/*************************************************************************
 * block_spacing_stats()
 * Estimate block-wide gap widths from the horizontal gaps between
 * consecutive blob boxes in the block's rows.
 *
 * Only rows with a non-empty blob list are used and, when
 * tosp_only_use_prop_rows is set, only rows whose pitch decision is
 * PITCH_DEF_PROP or PITCH_CORR_PROP.
 *
 * Outputs (all three are always written):
 *   old_text_ord_proportional  DEBUG ONLY. true when at most one gap was
 *                              collected; otherwise whether twice the
 *                              inter-quartile range of the centre-to-centre
 *                              distances exceeds that of the gaps.
 *   block_non_space_gap_width  floor of the median gap; when at most one
 *                              gap was collected, the narrowest blob width
 *                              seen (MAXSPACING if no blob was seen).
 *   block_space_gap_width      -1 when there are too few samples, otherwise
 *                              the larger of the floored median "wide" gap
 *                              and 3 * block_non_space_gap_width.
 *************************************************************************/

void Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional,
                                  int16_t &block_space_gap_width,    // resulting estimate
                                  int16_t &block_non_space_gap_width // resulting estimate
) {
  BLOBNBOX_IT blob_it; // iterator

  STATS centre_to_centre_stats(0, MAXSPACING - 1); // DEBUG USE ONLY
  STATS all_gap_stats(0, MAXSPACING - 1);
  STATS space_gap_stats(0, MAXSPACING - 1);
  int16_t minwidth = MAXSPACING; // narrowest blob

  // A row contributes only if it has blobs and passes the optional
  // proportional-rows-only filter.
  auto row_is_usable = [&](TO_ROW *candidate) {
    return !candidate->blob_list()->empty() &&
           (!tosp_only_use_prop_rows || (candidate->pitch_decision == PITCH_DEF_PROP) ||
            (candidate->pitch_decision == PITCH_CORR_PROP));
  };

  // Fetch the next box with whichever stepping function the parameters
  // select; both passes must use the same selection.
  auto next_box = [&](TO_ROW *owner, BLOBNBOX_IT *it) -> TBOX {
    if (tosp_use_pre_chopping) {
      return box_next_pre_chopped(it);
    }
    if (tosp_stats_use_xht_gaps) {
      return reduced_box_next(owner, it);
    }
    return box_next(it);
  };

  /* Pass 1: collect every gap that ignore_big_gap() does not reject. */
  TO_ROW_IT row_it(block->get_rows());
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    TO_ROW *row = row_it.data();
    if (!row_is_usable(row)) {
      continue;
    }
    blob_it.set_to_list(row->blob_list());
    blob_it.mark_cycle_pt();
    // Right edge of the last blob in the list.
    const int32_t end_of_row = blob_it.data_relative(-1)->bounding_box().right();
    TBOX prev_blob_box = next_box(row, &blob_it);
    const int32_t row_length = end_of_row - prev_blob_box.left();
    if (prev_blob_box.width() < minwidth) {
      minwidth = prev_blob_box.width();
    }
    while (!blob_it.cycled_list()) {
      const TBOX blob_box = next_box(row, &blob_it);
      if (blob_box.width() < minwidth) {
        minwidth = blob_box.width();
      }
      const int16_t left = prev_blob_box.right();
      const int16_t right = blob_box.left();
      const int16_t gap_width = right - left;
      if (!ignore_big_gap(row, row_length, gapmap, left, right)) {
        all_gap_stats.add(gap_width, 1);

        // DEBUG: distance between the centres of the two boxes.
        const int16_t centre_to_centre =
            (right + blob_box.right() - (prev_blob_box.left() + left)) / 2;
        centre_to_centre_stats.add(centre_to_centre, 1);
      }
      prev_blob_box = blob_box;
    }
  }

  // Inadequate samples
  if (all_gap_stats.get_total() <= 1) {
    block_non_space_gap_width = minwidth;
    block_space_gap_width = -1; // No est. space width
    old_text_ord_proportional = true; // DEBUG
    return;
  }

  /* For debug only ..... */
  const float iqr_centre_to_centre =
      centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25);
  const float iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25);
  old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats;
  /* .......For debug only */

  /*
The median of the gaps is used as an estimate of the NON-SPACE gap width.
This RELIES on the assumption that there are more gaps WITHIN words than
BETWEEN words in a block

Now try to estimate the width of a real space for all real spaces in the
block. Do this by using a crude threshold to ignore "narrow" gaps, then
find the median of the "wide" gaps and use this.
*/
  block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median()));
  // median gap

  /* Pass 2: collect only the "wide" gaps. */
  row_it.set_to_list(block->get_rows());
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    TO_ROW *row = row_it.data();
    if (!row_is_usable(row)) {
      continue;
    }
    const float real_space_threshold =
        std::max(tosp_init_guess_kn_mult * block_non_space_gap_width,
                 tosp_init_guess_xht_mult * row->xheight);
    blob_it.set_to_list(row->blob_list());
    blob_it.mark_cycle_pt();
    const int32_t end_of_row = blob_it.data_relative(-1)->bounding_box().right();
    TBOX prev_blob_box = next_box(row, &blob_it);
    // Same orientation as pass 1 and as row_spacing_stats() /
    // isolated_row_stats(). This used to be left - end_of_row, which handed
    // ignore_big_gap() a non-positive row length in this pass only.
    const int32_t row_length = end_of_row - prev_blob_box.left();
    while (!blob_it.cycled_list()) {
      const TBOX blob_box = next_box(row, &blob_it);
      const int16_t left = prev_blob_box.right();
      const int16_t right = blob_box.left();
      const int16_t gap_width = right - left;
      if ((gap_width > real_space_threshold) &&
          !ignore_big_gap(row, row_length, gapmap, left, right)) {
        /*
If tosp_block_use_cert_spaces is enabled, the estimate of the space gap is
restricted to obvious spaces - those wider than half the xht or
those with wide blobs on both sides - i.e not things that are
suspect 1's or punctuation that is sometimes widely spaced.
*/
        if (!tosp_block_use_cert_spaces ||
            (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
            ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
             (!tosp_narrow_blobs_not_cert ||
              (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
            (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
          space_gap_stats.add(gap_width, 1);
        }
      }
      prev_blob_box = blob_box;
    }
  }

  // Inadequate samples
  if (space_gap_stats.get_total() <= 2) {
    block_space_gap_width = -1; // No est. space width
  } else {
    // Never report a space narrower than three non-space gaps.
    block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())),
                                     static_cast<int16_t>(3 * block_non_space_gap_width));
  }
}

/*************************************************************************
 * row_spacing_stats()
 * Set kern_size, space_size, space_threshold, max_nonspace and min_space
 * on the given row from the gaps between its blobs, falling back on the
 * block-level estimates when the row has too few samples.
 *
 * row                        row to measure and update.
 * gapmap                     passed through to ignore_big_gap() and
 *                            isolated_row_stats().
 * block_idx, row_idx         used for debug output and forwarded to
 *                            isolated_row_stats().
 * block_space_gap_width      block estimate of a space; a value <= 0 is
 *                            treated as "no estimate" and replaced locally
 *                            by floor(row->xheight / 2).
 * block_non_space_gap_width  block estimate of a non-space gap.
 *
 * The function returns nothing; all results are written to *row.
 *************************************************************************/
void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx,
                                int16_t block_space_gap_width,    // estimate for block
                                int16_t block_non_space_gap_width // estimate for block
) {
  // iterator
  BLOBNBOX_IT blob_it = row->blob_list();
  // all_gap_stats: every gap not rejected by ignore_big_gap().
  // cert_space_gap_stats: gaps >= threshold that also pass the "certain
  // space" test. all_space_gap_stats: all gaps >= threshold.
  // small_gap_stats: gaps below the threshold.
  STATS all_gap_stats(0, MAXSPACING - 1);
  STATS cert_space_gap_stats(0, MAXSPACING - 1);
  STATS all_space_gap_stats(0, MAXSPACING - 1);
  STATS small_gap_stats(0, MAXSPACING - 1);
  TBOX blob_box;
  TBOX prev_blob_box;
  int16_t gap_width;
  // Stays 0 (and is only printed in debug output) if the row has no blobs.
  int16_t real_space_threshold = 0;
  int16_t max = 0;
  int16_t large_gap_count = 0;
  bool suspected_table;
  // Remember whether the caller supplied a usable block space estimate
  // before it is possibly overwritten below.
  bool good_block_space_estimate = block_space_gap_width > 0;
  int32_t end_of_row;
  int32_t row_length = 0;
  float sane_space;
  int32_t sane_threshold;

  /* Collect first pass stats for row */

  if (!good_block_space_estimate) {
    block_space_gap_width = int16_t(std::floor(row->xheight / 2));
  }
  if (!row->blob_list()->empty()) {
    // Initial split between "small" and "space" gaps: either a biased point
    // between the two block estimates or their midpoint.
    if (tosp_threshold_bias1 > 0) {
      real_space_threshold =
          block_non_space_gap_width +
          int16_t(floor(0.5 + tosp_threshold_bias1 *
                                  (block_space_gap_width - block_non_space_gap_width)));
    } else {
      real_space_threshold = // Old TO method
          (block_space_gap_width + block_non_space_gap_width) / 2;
    }
    blob_it.set_to_list(row->blob_list());
    blob_it.mark_cycle_pt();
    // Right edge of the last blob in the list.
    end_of_row = blob_it.data_relative(-1)->bounding_box().right();
    if (tosp_use_pre_chopping) {
      blob_box = box_next_pre_chopped(&blob_it);
    } else if (tosp_stats_use_xht_gaps) {
      blob_box = reduced_box_next(row, &blob_it);
    } else {
      blob_box = box_next(&blob_it);
    }
    row_length = end_of_row - blob_box.left();
    prev_blob_box = blob_box;
    while (!blob_it.cycled_list()) {
      if (tosp_use_pre_chopping) {
        blob_box = box_next_pre_chopped(&blob_it);
      } else if (tosp_stats_use_xht_gaps) {
        blob_box = reduced_box_next(row, &blob_it);
      } else {
        blob_box = box_next(&blob_it);
      }
      int16_t left = prev_blob_box.right();
      int16_t right = blob_box.left();
      gap_width = right - left;
      if (ignore_big_gap(row, row_length, gapmap, left, right)) {
        // Rejected gaps are only counted; they feed the table heuristic.
        large_gap_count++;
      } else {
        if (gap_width >= real_space_threshold) {
          // "Certain" space: cert filtering disabled, or gap wide relative
          // to x-height, or moderately wide with no narrow neighbour (when
          // that check is enabled), or wide blobs on both sides.
          if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
              ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
               (!tosp_narrow_blobs_not_cert ||
                (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
              (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
            cert_space_gap_stats.add(gap_width, 1);
          }
          all_space_gap_stats.add(gap_width, 1);
        } else {
          small_gap_stats.add(gap_width, 1);
        }
        all_gap_stats.add(gap_width, 1);
      }
      prev_blob_box = blob_box;
    }
  }
  // More than one ignored gap, or one ignored gap in a row with few
  // samples, marks the row as a suspected table.
  suspected_table = (large_gap_count > 1) ||
                    ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples));

  /* Now determine row kern size, space size and threshold */

  // Enough certain spaces (or any at all for a suspected table / short
  // row): use old_to_method() with the certain-space stats.
  if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) ||
      ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) &&
       cert_space_gap_stats.get_total() > 0)) {
    old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats,
                  block_space_gap_width, block_non_space_gap_width);
  } else {
    // Otherwise try isolated_row_stats() (if enabled); when it is disabled
    // or returns false, fall back on block defaults or old_to_method() with
    // all space-sized gaps.
    if (!tosp_recovery_isolated_row_stats ||
        !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) {
      if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) {
        tprintf("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx);
      }
      if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
        // Use block default
        row->space_size = block_space_gap_width;
        if (all_gap_stats.get_total() > tosp_redo_kern_limit) {
          row->kern_size = all_gap_stats.median();
        } else {
          row->kern_size = block_non_space_gap_width;
        }
        row->space_threshold =
            int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
      } else {
        old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats,
                      block_space_gap_width, block_non_space_gap_width);
      }
    }
  }

  if (tosp_improve_thresh && !suspected_table) {
    improve_row_threshold(row, &all_gap_stats);
  }

  /* Now lets try to be careful not to do anything silly with tables when we
are ignoring big gaps*/
  if (tosp_sanity_method == 0) {
    // Method 0: only suspected tables are corrected, by forcing the
    // threshold to a multiple of the kern size.
    if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
      if (tosp_debug_level > 5) {
        tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx,
                row->kern_size, row->space_threshold, row->space_size);
      }
      row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size);
      row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
    }
  } else if (tosp_sanity_method == 1) {
    // Method 1: three successive checks (space vs kern, threshold vs kern,
    // then tables). Any other tosp_sanity_method value applies no check.
    sane_space = row->space_size;
    /* NEVER let space size get too close to kern size */
    if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
        ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) {
      if (good_block_space_estimate &&
          (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) {
        sane_space = block_space_gap_width;
      } else {
        sane_space =
            std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),
                     row->xheight / 2.0f);
      }
      if (tosp_debug_level > 5) {
        tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx,
                row->kern_size, row->space_threshold, row->space_size, sane_space);
      }
      row->space_size = sane_space;
      row->space_threshold =
          int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
    }
    /* NEVER let threshold get VERY far away from kern */
    sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f)));
    if (row->space_threshold > sane_threshold) {
      if (tosp_debug_level > 5) {
        tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx,
                row->kern_size, row->space_threshold, row->space_size, sane_threshold);
      }
      row->space_threshold = sane_threshold;
      // Keep space_size strictly above the (lowered) threshold.
      if (row->space_size <= sane_threshold) {
        row->space_size = row->space_threshold + 1.0f;
      }
    }
    /* Beware of tables - there may be NO spaces */
    if (suspected_table) {
      sane_space =
          std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight);
      sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2));

      if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) {
        if (tosp_debug_level > 5) {
          tprintf("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx,
                  row->kern_size, row->space_threshold, row->space_size);
        }
        // the minimum sane value
        row->space_threshold = static_cast<int32_t>(sane_space);
        row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
      }
    }
  }

  /* Now lets try to put some error limits on the threshold */

  if (tosp_old_to_method) {
    /* Old textord made a space if gap >= threshold */
    // NO FUZZY SPACES YET
    row->max_nonspace = row->space_threshold;
    // NO FUZZY SPACES       YET
    row->min_space = row->space_threshold + 1;
  } else {
    /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
    row->min_space =
        std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size));
    if (row->min_space <= row->space_threshold) {
      // Don't be silly
      row->min_space = row->space_threshold + 1;
    }
    /*
Lets try to guess the max certain kern gap by looking at the cluster of
kerns for the row. The row is proportional so the kerns should cluster
tightly at the bottom of the distribution. We also expect most gaps to be
kerns. Find the maximum of the kern piles between 0 and twice the kern
estimate. Piles before the first one with less than 1/10 the maximum
number of samples can be taken as certain kerns.

  Of course, there are some cases where the kern peak and space peaks merge,
  so we will put an UPPER limit on the max certain kern gap of some fraction
  below the threshold.
*/

    // upper bound
    int32_t max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2);

    // default
    row->max_nonspace = max_max_nonspace;
    // Scan the histogram upwards tracking the tallest pile so far; stop at
    // the first pile above kern_size that holds less than a tenth of it.
    for (int32_t index = 0; index <= max_max_nonspace; index++) {
      if (all_gap_stats.pile_count(index) > max) {
        max = all_gap_stats.pile_count(index);
      }
      if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) {
        row->max_nonspace = index;
        break;
      }
    }
  }

  /* Yet another algorithm - simpler this time - just choose a fraction of the
threshold to space range */

  // This can only raise min_space, never lower it.
  if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) {
    row->min_space = std::max(
        row->min_space, static_cast<int32_t>(ceil(row->space_threshold +
                                                  tosp_fuzzy_sp_fraction *
                                                      (row->space_size - row->space_threshold))));
  }

  /* Ensure that ANY space less than some multiplier times the kern size is
fuzzy.  In tables there is a risk of erroneously setting a small space size
when there are no real spaces. Sometimes tables have text squashed into
columns so that the kn->sp ratio is small anyway - this means that we can't
use this to force a wider separation - hence we rely on context to join any
dubious breaks. */

  if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) {
    row->min_space = std::max(
        row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size)));
  }

  // When enabled, this overrides the max_nonspace chosen above with a
  // rounded fraction of the kern-to-threshold range.
  if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
    row->max_nonspace = static_cast<int32_t>(floor(
        0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size)));
  }
  if (row->max_nonspace > row->space_threshold) {
    // Don't be silly
    row->max_nonspace = row->space_threshold;
  }

  // Debug dump: block-level inputs first, then the row results.
  if (tosp_debug_level > 5) {
    tprintf(
        "B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) "
        "Sp:%3.2f\n",
        block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width,
        real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold,
        row->min_space, row->space_size);
  }
  if (tosp_debug_level > 10) {
    tprintf(
        "row->kern_size = %3.2f, row->space_size = %3.2f, "
        "row->space_threshold = %d\n",
        row->kern_size, row->space_size, row->space_threshold);
  }
}

// Estimate row->space_size, row->kern_size and row->space_threshold from
// the supplied gap histograms, using the block-level estimates both as
// limits and as fallbacks when the row has too few samples.
//   all_gap_stats    every usable gap in the row.
//   space_gap_stats  gaps regarded as spaces by the caller.
//   small_gap_stats  gaps regarded as non-spaces by the caller.
void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,
                            STATS *small_gap_stats,
                            int16_t block_space_gap_width,    // estimate for block
                            int16_t block_non_space_gap_width // estimate for block
) {
  // Pull row->space_size back towards the block estimates: cap it from
  // above relative to the block space width, then enforce a floor of
  // floor_mult non-space gaps plus one.
  auto limit_space_size = [&](int floor_mult) {
    if (row->space_size > block_space_gap_width * 1.5) {
      if (tosp_old_to_bug_fix) {
        row->space_size = block_space_gap_width * 1.5;
      } else {
        // BUG??? should be *1.5
        row->space_size = block_space_gap_width;
      }
    }
    if (row->space_size < (block_non_space_gap_width * floor_mult) + 1) {
      row->space_size = (block_non_space_gap_width * floor_mult) + 1;
    }
  };

  /* Step 1: row space size (old TO condition was > 2 samples) */
  const auto space_samples = space_gap_stats->get_total();
  if (space_samples >= tosp_enough_space_samples_for_median) {
    // Adequate samples: median, limited if it seems wildly out.
    row->space_size = space_gap_stats->median();
    limit_space_size(2);
  } else if (space_samples >= 1) {
    // Only a few samples, hence mean rather than median, and a higher floor.
    row->space_size = space_gap_stats->mean();
    limit_space_size(3);
  } else {
    // No samples at all: use block default.
    row->space_size = block_space_gap_width;
  }

  /* Step 2: row kern size */
  if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) {
    row->kern_size = small_gap_stats->median();
  } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) {
    row->kern_size = all_gap_stats->median();
  } else {
    // old TO - same value for all rows of the block
    row->kern_size = block_non_space_gap_width;
  }

  /* Step 3: row space threshold */
  if (tosp_threshold_bias2 > 0) {
    // Rounded point a configurable fraction of the way from kern to space.
    row->space_threshold = int32_t(
        floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size)));
  } else {
    // NOTE: old text ord used (space_size + kern_size + 1) / 2 held in a
    // float and tested with >=. New textord uses an integer threshold and a
    // > test, which comes to the same thing (except that old textord had
    // integer space_size and kern_size).
    row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
  }

  /* Step 4: apply the same ratios as row_spacing_stats to restrict the
     relative values of space_size, kern_size and space_threshold */
  const bool space_near_kern =
      row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f);
  const bool separation_too_small =
      (row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight;
  if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 &&
      (space_near_kern || separation_too_small)) {
    if (row->kern_size > 2.5) {
      row->kern_size = row->space_size / tosp_min_sane_kn_sp;
    }
    row->space_threshold =
        int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
  }
}

/*************************************************************************
 * isolated_row_stats()
 * Set kern_size, space_size and space_threshold for a row using the row's
 * own gap statistics only.
 * Returns true if the values were set and passed the sanity check.
 * Returns false without touching the row if there are too few samples;
 * returns false with all three values zeroed if the sanity check fails.
 *************************************************************************/
bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats,
                                 bool suspected_table, int16_t block_idx, int16_t row_idx) {
  // iterator
  BLOBNBOX_IT blob_it = row->blob_list();

  // Crude split between kerns and spaces, from the median gap and x-height.
  const float kern_guess = all_gap_stats->median();
  const float crude_split =
      std::max(tosp_init_guess_kn_mult * kern_guess, tosp_init_guess_xht_mult * row->xheight);
  const int16_t narrow_count =
      stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_split)));
  const int16_t sample_count = all_gap_stats->get_total();

  // Give up with too few gaps, too small a share of narrow gaps, or no gap
  // at or above the crude split.
  const bool unusable = (sample_count <= tosp_redo_kern_limit) ||
                        ((narrow_count / static_cast<float>(sample_count)) <
                         tosp_enough_small_gaps) ||
                        (sample_count - narrow_count < 1);
  if (unusable) {
    if (tosp_debug_level > 5) {
      tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx);
    }
    return false;
  }

  STATS cert_space_gap_stats(0, MAXSPACING - 1);
  STATS all_space_gap_stats(0, MAXSPACING - 1);
  STATS small_gap_stats(0, MAXSPACING - 1);

  // Step to the next box using whichever method the parameters select.
  auto advance = [&]() -> TBOX {
    if (tosp_use_pre_chopping) {
      return box_next_pre_chopped(&blob_it);
    }
    if (tosp_stats_use_xht_gaps) {
      return reduced_box_next(row, &blob_it);
    }
    return box_next(&blob_it);
  };

  blob_it.set_to_list(row->blob_list());
  blob_it.mark_cycle_pt();
  const int32_t end_of_row = blob_it.data_relative(-1)->bounding_box().right();
  TBOX prev_blob_box = advance();
  const int32_t row_length = end_of_row - prev_blob_box.left();

  while (!blob_it.cycled_list()) {
    const TBOX blob_box = advance();
    const int16_t left = prev_blob_box.right();
    const int16_t right = blob_box.left();
    const int16_t gap_width = right - left;

    if (!ignore_big_gap(row, row_length, gapmap, left, right) && (gap_width > crude_split)) {
      // Certain space: wide relative to x-height, or moderately wide with no
      // narrow neighbour (when that check is on), or wide blobs either side.
      const bool certain_space =
          (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
          ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
           (!tosp_narrow_blobs_not_cert ||
            (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
          (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box));
      if (certain_space) {
        cert_space_gap_stats.add(gap_width, 1);
      }
      all_space_gap_stats.add(gap_width, 1);
    }
    // Narrow gaps are recorded regardless of the ignore_big_gap() verdict.
    if (gap_width < crude_split) {
      small_gap_stats.add(gap_width, 1);
    }

    prev_blob_box = blob_box;
  }

  // Space size: prefer certain spaces, then all space-sized gaps; use the
  // median when there are enough samples, else the mean.
  if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
    row->space_size = cert_space_gap_stats.median();
  } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) {
    // to avoid spaced 1's in tables
    row->space_size = cert_space_gap_stats.mean();
  } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
    row->space_size = all_space_gap_stats.median();
  } else {
    row->space_size = all_space_gap_stats.mean();
  }

  // Kern size from the narrow gaps only, or from every gap.
  if (tosp_only_small_gaps_for_kern) {
    row->kern_size = small_gap_stats.median();
  } else {
    row->kern_size = all_gap_stats->median();
  }
  row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));

  /* Sanity check: need kern < threshold < space and threshold > 0 */
  const bool insane = (row->kern_size >= row->space_threshold) ||
                      (row->space_threshold >= row->space_size) || (row->space_threshold <= 0);
  if (insane) {
    if (tosp_debug_level > 5) {
      tprintf("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx,
              row->kern_size, row->space_threshold, row->space_size);
    }
    row->kern_size = 0.0f;
    row->space_threshold = 0;
    row->space_size = 0.0f;
    return false;
  }

  if (tosp_debug_level > 5) {
    tprintf("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size,
            row->space_threshold, row->space_size);
  }
  return true;
}

// Sum the pile counts of stats for every index in [0, threshold).
// Returns 0 when threshold <= 0. The sum is accumulated in an int16_t.
int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {
  int16_t running = 0;
  int16_t bucket = 0;
  while (bucket < threshold) {
    running += stats->pile_count(bucket);
    bucket++;
  }
  return running;
}

/*************************************************************************
 * improve_row_threshold()
 *    Try to recognise a "normal line" -
 *           > 25 gaps
 *     &&    space > 3 * kn  && space > 10
 *              (I.e. reasonably large space and kn:sp ratio)
 *     &&    > 3/4 # gaps < kn + (sp - kn)/3
 *              (I.e. most gaps are well away from space estimate)
 *     &&    a gap of max(3, (sp - kn) / 3) empty histogram positions is found
 *           somewhere in the histogram between kn and sp
 *     THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
 *          NO!!!!! the bristol line has "11" with a gap of 12 between the
 *          1's!!! Try moving the default threshold to within this band but
 *          leave the fuzzy limit calculation as at present.
 *
 * row            row whose kern_size / space_size are read and whose
 *                space_threshold may be moved (nothing else is modified).
 * all_gap_stats  histogram of the gap widths measured for this row.
 *************************************************************************/
void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
  float sp = row->space_size;
  float kn = row->kern_size;
  int16_t reqd_zero_width = 0;
  int16_t zero_width = 0;
  int16_t zero_start = 0;
  int16_t index = 0;

  // The " 0", " 1", " 2" debug fragments are progress markers appended to one
  // trace line, which is why they carry no newline of their own.
  if (tosp_debug_level > 10) {
    tprintf("Improve row threshold 0");
  }
  // Not a "normal line": too few gaps, too small a space estimate, too poor a
  // kn:sp ratio, or fewer than 3/4 of the gaps lie in the lower third.
  if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) ||
      (stats_count_under(all_gap_stats,
                         static_cast<int16_t>(std::ceil(kn + (sp - kn) / 3 + 0.5))) <
       (0.75 * all_gap_stats->get_total()))) {
    return;
  }
  if (tosp_debug_level > 10) {
    tprintf(" 1");
  }
  /*
Look for the first region of all 0's in the histogram which is wider than
max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current
threshold is not within it, move the threshold so that is just inside it.
*/
  reqd_zero_width = static_cast<int16_t>(std::floor((sp - kn) / 3 + 0.5));
  if (reqd_zero_width < 3) {
    reqd_zero_width = 3;
  }

  for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) {
    if (all_gap_stats->pile_count(index) == 0) {
      if (zero_width == 0) {
        zero_start = index;
      }
      zero_width++;
    } else {
      if (zero_width >= reqd_zero_width) {
        break; // a wide enough empty band has just ended
      } else {
        zero_width = 0; // band too short - start looking again
      }
    }
  }
  // Step back so that index is the last position examined before the loop
  // stopped, i.e. the last empty position when a band was found.
  index--;
  if (tosp_debug_level > 10) {
    // Fixed: the terminator used to be the two characters "/n", so this trace
    // line was never ended.
    tprintf(" reqd_z_width: %d found %d 0's, starting %d; thresh: %d\n", reqd_zero_width,
            zero_width, zero_start, row->space_threshold);
  }
  // Nothing to do if no wide enough band exists or the threshold is already
  // inside [zero_start, index].
  if ((zero_width < reqd_zero_width) ||
      ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) {
    return;
  }
  if (tosp_debug_level > 10) {
    tprintf(" 2");
  }
  // The threshold is now known to lie strictly outside the band: pull it to
  // the nearer edge.
  const int16_t new_threshold = (row->space_threshold < zero_start) ? zero_start : index;
  if (tosp_debug_level > 5) {
    tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n", kn, sp, zero_start,
            index, row->space_threshold, new_threshold);
  }
  row->space_threshold = new_threshold;
}

/**********************************************************************
 * make_prop_words
 *
 * Convert a TO_ROW to a ROW.
 *
 * Walks the row's blob list once (the list iterator is circular, so the
 * walk ends when it is back at the first element), collecting C_BLOBs
 * into a pending list and turning that list into a WERD every time
 * make_a_word_break() reports a break, the end of the row is reached, or
 * a repeated-character word from row->rep_words has to be spliced in
 * (decided by comparing blob left edges with the right edge of the next
 * repeated-character word).
 *
 * row       row to convert; its blobs and rep_words are consumed.
 * rotation  not referenced anywhere in this function body.
 * Returns a newly allocated ROW, or nullptr when the row has no blobs.
 **********************************************************************/
ROW *Textord::make_prop_words(TO_ROW *row,    // row to make
                              FCOORD rotation // for drawing
) {
  bool bol; // start of line
  /* prev_ values are for start of word being built. non prev_ values are for
the gap between the word being built and the next one. */
  bool prev_fuzzy_sp;     // probably space
  bool prev_fuzzy_non;    // probably not
  uint8_t prev_blanks;    // in front of word
  bool fuzzy_sp = false;  // probably space
  bool fuzzy_non = false; // probably not
  uint8_t blanks = 0;     // in front of word
  bool prev_gap_was_a_space = false;
  bool break_at_next_gap = false;
  ROW *real_row; // output row
  C_OUTLINE_IT cout_it;
  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it = &cblobs;
  WERD_LIST words;
  WERD *word; // new word
  // INT32_MAX means "no repeated-char word pending": no blob can lie to the
  // right of it.
  int32_t next_rep_char_word_right = INT32_MAX;
  float repetition_spacing; // gap between repetitions
  int32_t xstarts[2];       // row ends
  int32_t prev_x;           // end of prev blob
  BLOBNBOX_IT box_it;       // iterator
  TBOX prev_blob_box;
  TBOX next_blob_box;
  // Sliding window of gaps around the blob being examined; INT16_MAX marks
  // "no gap known" (start / end of row).
  int16_t prev_gap = INT16_MAX;
  int16_t current_gap = INT16_MAX;
  int16_t next_gap = INT16_MAX;
  int16_t prev_within_xht_gap = INT16_MAX;
  int16_t current_within_xht_gap = INT16_MAX;
  int16_t next_within_xht_gap = INT16_MAX;
  int16_t word_count = 0;

  // repeated char words
  WERD_IT rep_char_it(&(row->rep_words));
  if (!rep_char_it.empty()) {
    next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
  }

  prev_x = -INT16_MAX;
  cblob_it.set_to_list(&cblobs);
  box_it.set_to_list(row->blob_list());
  // new words
  WERD_IT word_it(&words);
  bol = true;
  prev_blanks = 0;
  prev_fuzzy_sp = false;
  prev_fuzzy_non = false;
  if (!box_it.empty()) {
    xstarts[0] = box_it.data()->bounding_box().left();
    if (xstarts[0] > next_rep_char_word_right) {
      /* We need to insert a repeated char word at the start of the row */
      word = rep_char_it.extract();
      word_it.add_after_then_move(word);
      /* Set spaces before repeated char word */
      word->set_flag(W_BOL, true);
      bol = false;
      word->set_blanks(0);
      // NO uncertainty
      word->set_flag(W_FUZZY_SP, false);
      word->set_flag(W_FUZZY_NON, false);
      xstarts[0] = word->bounding_box().left();
      /* Set spaces after repeated char word (and leave current word set) */
      repetition_spacing = find_mean_blob_spacing(word);
      current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right;
      current_within_xht_gap = current_gap;
      if (current_gap > tosp_rep_space * repetition_spacing) {
        // NOTE(review): row->space_size is used as a divisor without a zero
        // check here (make_a_word_break guards the same division) - confirm
        // it cannot be 0 on this path.
        prev_blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
        if (prev_blanks < 1) {
          prev_blanks = 1;
        }
      } else {
        prev_blanks = 0;
      }
      if (tosp_debug_level > 5) {
        tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f;  Rgap:%d  ",
                box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(),
                repetition_spacing, current_gap);
      }
      prev_fuzzy_sp = false;
      prev_fuzzy_non = false;
      // Move on to the next repeated-char word, if any is left.
      if (rep_char_it.empty()) {
        next_rep_char_word_right = INT32_MAX;
      } else {
        rep_char_it.forward();
        next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
      }
    }

    peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
    do {
      auto bblob = box_it.data();
      auto blob_box = bblob->bounding_box();
      if (bblob->joined_to_prev()) {
        // Continuation of the previous blob: append its outlines to the blob
        // most recently added to cblobs, then discard the emptied C_BLOB.
        auto cblob = bblob->remove_cblob();
        if (cblob != nullptr) {
          cout_it.set_to_list(cblob_it.data()->out_list());
          cout_it.move_to_last();
          cout_it.add_list_after(cblob->out_list());
          delete cblob;
        }
      } else {
        // Independent blob: hand the C_BLOB over to the pending word list.
        auto cblob = bblob->cblob();
        if (cblob != nullptr) {
          bblob->set_owns_cblob(false);
          cblob_it.add_after_then_move(cblob);
        }
        prev_x = blob_box.right();
      }
      box_it.forward(); // next one
      bblob = box_it.data();
      blob_box = bblob->bounding_box();

      if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
        /* Real Blob - not multiple outlines or pre-chopped */
        // Shift the gap window one blob to the right.
        prev_gap = current_gap;
        prev_within_xht_gap = current_within_xht_gap;
        prev_blob_box = next_blob_box;
        current_gap = next_gap;
        current_within_xht_gap = next_within_xht_gap;
        peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);

        int16_t prev_gap_arg = prev_gap;
        int16_t next_gap_arg = next_gap;
        if (tosp_only_use_xht_gaps) {
          prev_gap_arg = prev_within_xht_gap;
          next_gap_arg = next_within_xht_gap;
        }
        // Decide if a word-break should be inserted
        // (make_a_word_break is skipped by short-circuit when a repeated-char
        // word has to be inserted before this blob; box_it.at_first() means
        // the walk has wrapped, i.e. the end of the row.)
        if (blob_box.left() > next_rep_char_word_right ||
            make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap,
                              current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp,
                              fuzzy_non, prev_gap_was_a_space, break_at_next_gap) ||
            box_it.at_first()) {
          /* Form a new word out of the blobs collected */
          word = new WERD(&cblobs, prev_blanks, nullptr);
          word_count++;
          word_it.add_after_then_move(word);
          if (bol) {
            word->set_flag(W_BOL, true);
            bol = false;
          }
          if (prev_fuzzy_sp) {
            // probably space
            word->set_flag(W_FUZZY_SP, true);
          } else if (prev_fuzzy_non) {
            word->set_flag(W_FUZZY_NON, true);
          }
          // probably not

          if (blob_box.left() > next_rep_char_word_right) {
            /* We need to insert a repeated char word */
            word = rep_char_it.extract();
            word_it.add_after_then_move(word);

            /* Set spaces before repeated char word */
            repetition_spacing = find_mean_blob_spacing(word);
            current_gap = word->bounding_box().left() - prev_x;
            current_within_xht_gap = current_gap;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
              if (blanks < 1) {
                blanks = 1;
              }
            } else {
              blanks = 0;
            }
            if (tosp_debug_level > 5) {
              tprintf("Repch wd (%d,%d) rep gap %5.2f;  Lgap:%d (%d blanks);",
                      word->bounding_box().left(), word->bounding_box().bottom(),
                      repetition_spacing, current_gap, blanks);
            }
            word->set_blanks(blanks);
            // NO uncertainty
            word->set_flag(W_FUZZY_SP, false);
            word->set_flag(W_FUZZY_NON, false);

            /* Set spaces after repeated char word (and leave current word set)
             */
            current_gap = blob_box.left() - next_rep_char_word_right;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              // Unlike the sibling computations this one truncates via the
              // cast alone instead of calling std::floor first.
              blanks = static_cast<uint8_t>(current_gap / row->space_size);
              if (blanks < 1) {
                blanks = 1;
              }
            } else {
              blanks = 0;
            }
            if (tosp_debug_level > 5) {
              tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks);
            }
            fuzzy_sp = false;
            fuzzy_non = false;

            if (rep_char_it.empty()) {
              next_rep_char_word_right = INT32_MAX;
            } else {
              rep_char_it.forward();
              next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
            }
          }

          if (box_it.at_first() && rep_char_it.empty()) {
            // at end of line
            word->set_flag(W_EOL, true);
            xstarts[1] = prev_x;
          } else {
            // Carry the decision made for this gap over to the next word,
            // which is the one that follows the gap.
            prev_blanks = blanks;
            prev_fuzzy_sp = fuzzy_sp;
            prev_fuzzy_non = fuzzy_non;
          }
        }
      }
    } while (!box_it.at_first()); // until back at start

    /* Insert any further repeated char words */
    while (!rep_char_it.empty()) {
      word = rep_char_it.extract();
      word_it.add_after_then_move(word);

      /* Set spaces before repeated char word */
      repetition_spacing = find_mean_blob_spacing(word);
      current_gap = word->bounding_box().left() - prev_x;
      if (current_gap > tosp_rep_space * repetition_spacing) {
        blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
        if (blanks < 1) {
          blanks = 1;
        }
      } else {
        blanks = 0;
      }
      if (tosp_debug_level > 5) {
        tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
                word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing,
                current_gap, blanks);
      }
      word->set_blanks(blanks);
      // NO uncertainty
      word->set_flag(W_FUZZY_SP, false);
      word->set_flag(W_FUZZY_NON, false);
      prev_x = word->bounding_box().right();
      if (rep_char_it.empty()) {
        // at end of line
        word->set_flag(W_EOL, true);
        xstarts[1] = prev_x;
      } else {
        rep_char_it.forward();
      }
    }
    // Build the output row from the kern / space estimates (truncated to
    // int16_t) and move every word made above into it.
    real_row =
        new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
    word_it.set_to_list(real_row->word_list());
    // put words in row
    word_it.add_list_after(&words);
    real_row->recalc_bounding_box();

    if (tosp_debug_level > 4) {
      tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
              real_row->bounding_box().left(), real_row->bounding_box().bottom(),
              real_row->bounding_box().right(), real_row->bounding_box().top());
    }
    return real_row;
  }
  // Row without blobs: nothing to convert.
  return nullptr;
}

/**********************************************************************
 * make_blob_words
 *
 * Converts words into blobs so that each blob is a single character.
 *  Used for chopper test.
 *
 * Every independent blob of the row (together with any following blobs
 * flagged joined_to_prev, whose outlines are merged into it) becomes a
 * WERD of its own, created with one leading blank. The first word made is
 * flagged W_BOL and the word completed when the walk wraps round is
 * flagged W_EOL.
 *
 * row       row to convert; its blobs are consumed.
 * rotation  not referenced anywhere in this function body.
 * Returns a newly allocated ROW, or nullptr when the row has no blobs.
 **********************************************************************/
ROW *Textord::make_blob_words(TO_ROW *row,    // row to make
                              FCOORD rotation // for drawing
) {
  BLOBNBOX_IT box_it; // iterator over the row's blobs
  box_it.set_to_list(row->blob_list());
  if (box_it.empty()) {
    return nullptr; // nothing to convert
  }

  C_OUTLINE_IT cout_it;
  C_BLOB_LIST cblobs; // blobs collected for the word being built
  C_BLOB_IT cblob_it(&cblobs);
  WERD_LIST words;
  WERD_IT word_it(&words); // new words
  bool bol = true;         // start of line
  int16_t word_count = 0;

  do {
    auto bblob = box_it.data();
    if (bblob->joined_to_prev()) {
      // Continuation of the previous blob: append its outlines to the blob
      // most recently added to cblobs, then discard the emptied C_BLOB.
      // NOTE(review): relies on cblobs being non-empty here, i.e. on a
      // joined blob never being the first thing collected - confirm.
      auto cblob = bblob->remove_cblob();
      if (cblob != nullptr) {
        cout_it.set_to_list(cblob_it.data()->out_list());
        cout_it.move_to_last();
        cout_it.add_list_after(cblob->out_list());
        delete cblob;
      }
    } else {
      // Independent blob: hand the C_BLOB over to the pending list.
      auto cblob = bblob->cblob();
      if (cblob != nullptr) {
        bblob->set_owns_cblob(false);
        cblob_it.add_after_then_move(cblob);
      }
    }
    box_it.forward(); // next one
    bblob = box_it.data();

    // The next blob starts something new, so whatever has been collected so
    // far is complete and becomes a word.
    if (!bblob->joined_to_prev() && !cblobs.empty()) {
      WERD *word = new WERD(&cblobs, 1, nullptr);
      word_count++;
      word_it.add_after_then_move(word);
      if (bol) {
        word->set_flag(W_BOL, true);
        bol = false;
      }
      if (box_it.at_first()) { // at end of line
        word->set_flag(W_EOL, true);
      }
    }
  } while (!box_it.at_first()); // until back at start

  /* Setup the row with created words. */
  ROW *real_row =
      new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
  word_it.set_to_list(real_row->word_list());
  // put words in row
  word_it.add_list_after(&words);
  real_row->recalc_bounding_box();
  if (tosp_debug_level > 4) {
    tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
            real_row->bounding_box().left(), real_row->bounding_box().bottom(),
            real_row->bounding_box().right(), real_row->bounding_box().top());
  }
  return real_row;
}

// Decides whether the gap in front of blob_box is a word break.
//
// Inputs: the row's thresholds (max_nonspace, space_threshold, min_space,
// kern_size, space_size, xheight), the gap being judged both as measured
// (real_current_gap) and as measured on the reduced boxes
// (within_xht_current_gap), and the neighbouring gaps / blob boxes.
//
// Outputs (by reference):
//   blanks, fuzzy_sp, fuzzy_non - description of the break; per the comment
//       in the body they are only meaningful when true is returned. They are
//       left untouched on the break_at_next_gap shortcut, and in the old
//       method whenever the gap is not a space or equals INT16_MAX.
//   prev_gap_was_a_space - updated on the heuristic path only.
//   break_at_next_gap - set when a break is to be forced at the following
//       call; consumed (reset) by that call.
//
// Returns true when a word break should be made at this gap.
bool Textord::make_a_word_break(TO_ROW *row,   // row being made
                                TBOX blob_box, // for next_blob // how many blanks?
                                int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap,
                                int16_t within_xht_current_gap, TBOX next_blob_box,
                                int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non,
                                bool &prev_gap_was_a_space, bool &break_at_next_gap) {
  bool space;
  int16_t current_gap;
  float fuzzy_sp_to_kn_limit;

  // A break was requested by the previous call (punctuation rule below):
  // honour it once and clear the request.
  if (break_at_next_gap) {
    break_at_next_gap = false;
    return true;
  }
  /* Inhibit using the reduced gap if
  The kerning is large - chars are not kerned and reducing "f"s can cause
  erroneous blanks
OR  The real gap is less than 0
OR  The real gap is less than the kerning estimate
*/
  if ((row->kern_size > tosp_large_kerning * row->xheight) ||
      ((tosp_dont_fool_with_small_kerns >= 0) &&
       (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) {
    // Ignore the difference
    within_xht_current_gap = real_current_gap;
  }

  // Pick the gap measure that every later comparison uses.
  if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) {
    current_gap = within_xht_current_gap;
  } else {
    current_gap = real_current_gap;
  }

  if (tosp_old_to_method) {
    // Boring old method
    space = current_gap > row->max_nonspace;
    if (space && (current_gap < INT16_MAX)) {
      if (current_gap < row->min_space) {
        // Between max_nonspace and min_space: uncertain, leaning whichever
        // side of space_threshold the gap falls on.
        if (current_gap > row->space_threshold) {
          blanks = 1;
          fuzzy_sp = true;
          fuzzy_non = false;
        } else {
          blanks = 0;
          fuzzy_sp = false;
          fuzzy_non = true;
        }
      } else {
        // Definite space: estimate how many blanks it holds.
        if (row->space_size == 0.0f) {
          // Avoid FP division by 0.
          blanks = 1;
        } else {
          blanks = static_cast<uint8_t>(current_gap / row->space_size);
          if (blanks < 1) {
            blanks = 1;
          }
        }
        fuzzy_sp = false;
        fuzzy_non = false;
      }
    }
    return space;
  } else {
    /* New exciting heuristic method */
    if (prev_blob_box.null_box()) { // Beginning of row
      prev_gap_was_a_space = true;
    }

    // Default as old TO
    space = current_gap > row->space_threshold;

    /* Set defaults for the word break in case we find one.  Currently there are
no fuzzy spaces. Depending on the reliability of the different heuristics
we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
be used if the function returns true - ie the word is to be broken.
*/
    // When space_size is not above 1 the raw gap is used as the count; either
    // way the result is clipped to 1 .. UINT8_MAX.
    int num_blanks = current_gap;
    if (row->space_size > 1.0f) {
      num_blanks = IntCastRounded(current_gap / row->space_size);
    }
    blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));
    fuzzy_sp = false;
    fuzzy_non = false;
    /*
If xht measure causes gap to flip one of the 3 thresholds act accordingly -
despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
context.
*/
    // Rule 20: the reduced gap crosses max_nonspace.
    if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) &&
        (within_xht_current_gap > row->max_nonspace)) {
      space = true;
      fuzzy_non = true;
#ifndef GRAPHICS_DISABLED
      mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
               next_gap);
#endif
    // Rule 21: the reduced gap crosses space_threshold.
    } else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) &&
               (within_xht_current_gap > row->space_threshold)) {
      space = true;
      if (tosp_flip_fuzz_kn_to_sp) {
        fuzzy_sp = true;
      } else {
        fuzzy_non = true;
      }
#ifndef GRAPHICS_DISABLED
      mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
               next_gap);
#endif
    // Rule 22: the reduced gap reaches min_space.
    } else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) &&
               (within_xht_current_gap >= row->min_space)) {
      space = true;
#ifndef GRAPHICS_DISABLED
      mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
               next_gap);
#endif
    // A suspected punctuation blob following a non-punctuation blob: ask for
    // a break at the gap handled by the next call.
    } else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) &&
               suspected_punct_blob(row, blob_box)) {
      break_at_next_gap = true;
    }
    /* Now continue with normal heuristics */
    else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) {
      /* Heuristics to turn dubious spaces to kerns */
      // Gaps above this limit are only marked fuzzy rather than flipped
      // outright (unless tosp_all_flips_fuzzy makes every flip fuzzy).
      if (tosp_pass_wide_fuzz_sp_to_context > 0) {
        fuzzy_sp_to_kn_limit =
            row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size);
      } else {
        fuzzy_sp_to_kn_limit = 99999.0f;
      }

      /* If current gap is significantly smaller than the previous space the
other side of a narrow blob then this gap is a kern. */
      if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space &&
          (current_gap <= tosp_gap_factor * prev_gap)) {
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
          if (tosp_flip_fuzz_sp_to_kn) {
            fuzzy_non = true;
          } else {
            fuzzy_sp = true;
          }
        } else {
          space = false;
        }
#ifndef GRAPHICS_DISABLED
        mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
                 next_gap);
#endif
      }
      /* If current gap not much bigger than the previous kern the other side of
a narrow blob then this gap is a kern as well */
      else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) &&
               !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) {
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
          if (tosp_flip_fuzz_sp_to_kn) {
            fuzzy_non = true;
          } else {
            fuzzy_sp = true;
          }
        } else {
          space = false;
        }
#ifndef GRAPHICS_DISABLED
        mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
                 next_gap);
#endif
      // Rules 3 and 4 mirror rules 1 and 2, looking across a narrow NEXT blob
      // at the gap that follows it.
      } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
                 (next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) {
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
          if (tosp_flip_fuzz_sp_to_kn) {
            fuzzy_non = true;
          } else {
            fuzzy_sp = true;
          }
        } else {
          space = false;
        }
#ifndef GRAPHICS_DISABLED
        mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
                 next_gap);
#endif
      } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
                 (next_gap <= row->space_threshold) &&
                 (current_gap * tosp_gap_factor <= next_gap)) {
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
          if (tosp_flip_fuzz_sp_to_kn) {
            fuzzy_non = true;
          } else {
            fuzzy_sp = true;
          }
        } else {
          space = false;
        }
#ifndef GRAPHICS_DISABLED
        mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
                 next_gap);
#endif
      // Rule 6: a narrow neighbour on either side but none of the ratio tests
      // fired - keep the space, flagged as fuzzy.
      } else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) ||
                  ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) {
        fuzzy_sp = true;
#ifndef GRAPHICS_DISABLED
        mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
                 next_gap);
#endif
      }
    } else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) {
      /* Heuristics to turn dubious kerns to spaces */
      /* TRIED THIS BUT IT MADE THINGS WORSE
    if (prev_gap == INT16_MAX)
      prev_gap = 0;  // start of row
    if (next_gap == INT16_MAX)
      next_gap = 0;  // end of row
*/
      // Rule 7: the gap is large relative to both neighbouring gaps and sits
      // between two wide blobs.
      if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) &&
          (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&
          wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) {
        space = true;
        /*
tosp_flip_caution is an attempt to stop the default changing in cases
where there is a large difference between the kern and space estimates.
  See problem in 'chiefs' where "have" gets split in the quotation.
*/
        if ((tosp_flip_fuzz_kn_to_sp) &&
            ((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) {
          fuzzy_sp = true;
        } else {
          fuzzy_non = true;
        }
#ifndef GRAPHICS_DISABLED
        mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
                 next_gap);
#endif
      // Rule 8: as rule 7 with its own factor, requiring a gap above 5 and
      // neighbours that are neither narrow nor suspected punctuation.
      } else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 &&
                 current_gap > 5 && // Rule 9 handles small gap, big ratio.
                 current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&
                 !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) &&
                 !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) {
        space = true;
        fuzzy_non = true;
#ifndef GRAPHICS_DISABLED
        mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
                 next_gap);
#endif
      // Rule 9: enabled only when tosp_kern_gap_factor3 > 0; the punctuation
      // test is applied only when tosp_rule_9_test_punct is set.
      } else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) &&
                 (next_blob_box.width() > 0) &&
                 (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&
                 (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) &&
                                              !suspected_punct_blob(row, next_blob_box)))) {
        space = true;
        fuzzy_non = true;
#ifndef GRAPHICS_DISABLED
        mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
                 next_gap);
#endif
      }
    }
    if (tosp_debug_level > 10) {
      tprintf(
          "word break = %d current_gap = %d, prev_gap = %d, "
          "next_gap = %d\n",
          space ? 1 : 0, current_gap, prev_gap, next_gap);
    }
    // Remember for the next call whether this gap ended up as a space that is
    // not flagged fuzzy_non.
    prev_gap_was_a_space = space && !(fuzzy_non);
    return space;
  }
}

// A blob is "narrow" when its width is no more than tosp_narrow_fraction of
// the row's xheight, or failing that when its width : height ratio is no more
// than tosp_narrow_aspect_ratio.
bool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
  if (blob_box.width() <= tosp_narrow_fraction * row->xheight) {
    return true;
  }
  return (static_cast<float>(blob_box.width()) / blob_box.height()) <= tosp_narrow_aspect_ratio;
}

// Tells whether a blob is "wide".
//  - tosp_wide_fraction not positive: wide simply means not narrow_blob().
//  - otherwise the width must reach tosp_wide_fraction * xheight and, when
//    tosp_wide_aspect_ratio is positive, the width : height ratio must also
//    exceed that ratio.
bool Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
  if (!(tosp_wide_fraction > 0)) {
    return !narrow_blob(row, blob_box);
  }
  if (!(blob_box.width() >= tosp_wide_fraction * row->xheight)) {
    return false;
  }
  if (!(tosp_wide_aspect_ratio > 0)) {
    return true;
  }
  return (static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio;
}

// Flags a box as suspected punctuation when any of these holds:
//  - its height is at most 0.66 of the row's xheight;
//  - its top lies below the half-xheight line above the baseline;
//  - its bottom lies above that same half-xheight line.
// The baseline is evaluated at the horizontal centre of the box.
bool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
  /* Find baseline of centre of blob */
  float centre_x = (box.right() + box.left()) / 2.0;
  float base_y = row->baseline.y(centre_x);

  if (box.height() <= 0.66 * row->xheight) {
    return true;
  }
  if (box.top() < base_y + row->xheight / 2.0) {
    return true;
  }
  return box.bottom() > base_y + row->xheight / 2.0;
}

// Looks ahead from box_it (taken by value, so the caller's iterator does not
// move) and reports the box of the blob found there plus the gap that follows
// it, measured on the full boxes (next_gap) and on the reduced boxes
// (next_within_xht_gap). Both gaps are INT16_MAX when stepping over that blob
// wraps the iterator round to the first element.
void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box,
                               int16_t &next_gap, int16_t &next_within_xht_gap) {
  // Second cursor for the reduced boxes; copied before box_it is advanced.
  BLOBNBOX_IT reduced_it = box_it;

  next_blob_box = box_next(&box_it);
  TBOX next_reduced_box = reduced_box_next(row, &reduced_it);

  if (box_it.at_first()) {
    next_gap = INT16_MAX;
    next_within_xht_gap = INT16_MAX;
    return;
  }

  TBOX following_box = box_it.data()->bounding_box();
  next_gap = following_box.left() - next_blob_box.right();
  TBOX following_reduced_box = reduced_box_next(row, &reduced_it);
  next_within_xht_gap = following_reduced_box.left() - next_reduced_box.right();
}

#ifndef GRAPHICS_DISABLED
// Debug aid for the space / kern heuristics: when textord_show_initial_words
// is set, draws an ellipse in to_win over the gap in front of `blob`, coloured
// by the id of the rule that fired; when tosp_debug_level > 5 also prints the
// rule id with the surrounding gap and blob widths.
void Textord::mark_gap(TBOX blob,    // blob following gap
                       int16_t rule, // heuristic id
                       int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,
                       int16_t next_blob_width, int16_t next_gap) {
  if (textord_show_initial_words) {
    ScrollView::Color pen_colour; // of ellipse marking flipped gap
    switch (rule) {
      case 1:
        pen_colour = ScrollView::RED;
        break;
      case 2:
      case 20:
        pen_colour = ScrollView::CYAN;
        break;
      case 3:
      case 21:
        pen_colour = ScrollView::GREEN;
        break;
      case 5:
      case 22:
        pen_colour = ScrollView::MAGENTA;
        break;
      case 6:
        pen_colour = ScrollView::BLUE;
        break;
      case 7:
        pen_colour = ScrollView::WHITE;
        break;
      case 8:
        pen_colour = ScrollView::YELLOW;
        break;
      case 4:
      case 9:
      default:
        pen_colour = ScrollView::BLACK;
        break;
    }
    to_win->Pen(pen_colour);
    // The ellipse spans the gap horizontally and the blob vertically.
    to_win->Ellipse(current_gap / 2.0f,                    // x radius
                    blob.height() / 2.0f,                  // y radius
                    blob.left() - current_gap / 2.0f,      // x centre
                    blob.bottom() + blob.height() / 2.0f); // y centre
  }
  if (tosp_debug_level > 5) {
    tprintf("  (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2,
            blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap);
  }
}
#endif

// Returns the mean horizontal gap between consecutive blobs of `word`
// (left edge of a blob minus right edge of the one before it), or 0 when the
// word holds fewer than two blobs.
float Textord::find_mean_blob_spacing(WERD *word) {
  C_BLOB_IT blob_it;
  blob_it.set_to_list(word->cblob_list());
  if (blob_it.empty()) {
    return 0.0f;
  }

  int32_t total_gap = 0;
  int16_t num_gaps = 0;

  blob_it.mark_cycle_pt();
  // The first blob only supplies the starting right edge.
  int16_t last_right = blob_it.data()->bounding_box().right();
  blob_it.forward();
  while (!blob_it.cycled_list()) {
    TBOX box = blob_it.data()->bounding_box();
    total_gap += box.left() - last_right;
    num_gaps++;
    last_right = box.right();
    blob_it.forward();
  }

  if (num_gaps == 0) {
    return 0.0f;
  }
  return total_gap / static_cast<float>(num_gaps);
}

// Decides whether the gap [left, right] is big enough to be ignored, working
// in multiples of the row's xheight. tosp_ignore_big_gaps selects the policy:
//   > 999 : never ignore;
//   > 0   : ignore gaps wider than that many xheights;
//   else  : always ignore gaps wider than tosp_ignore_very_big_gaps xheights,
//           then == 0 applies the row-length / table-gap rules below and
//           < 0 ignores only table gaps wider than gapmap_big_gaps xheights.
bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left,
                             int16_t right) {
  if (tosp_ignore_big_gaps > 999) {
    return false; // Don't ignore
  }

  int16_t gap_width = right - left + 1;
  const auto xht = row->xheight;

  if (tosp_ignore_big_gaps > 0) {
    return (gap_width > tosp_ignore_big_gaps * xht);
  }
  if (gap_width > tosp_ignore_very_big_gaps * xht) {
    return true;
  }

  if (tosp_ignore_big_gaps == 0) {
    if ((gap_width > 2.1 * xht) && (row_length > 20 * xht)) {
      return true;
    }
    // Slightly smaller gaps need a longer row or table evidence.
    return (gap_width > 1.75 * xht) &&
           ((row_length > 35 * xht) || gapmap->table_gap(left, right));
  }

  /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table
   */
  return (gap_width > gapmap_big_gaps * xht) && gapmap->table_gap(left, right);
}

/**********************************************************************
 * reduced_box_next
 *
 * Compute the bounding box of this blob with merging of x overlaps
 * but no pre-chopping.
 * Then move the iterator on to the start of the next blob.
 * DON'T reduce the box for small things - eg punctuation.
 *
 * The result is stored on the blob the iterator started at, and a later
 * call on that blob returns the stored box without recomputing it.
 **********************************************************************/
TBOX Textord::reduced_box_next(TO_ROW *row,    // current row
                               BLOBNBOX_IT *it // iterator to blobs
) {
  BLOBNBOX *first_blob = it->data(); // blob the result is stored on

  // Already computed: just step to the next real blob.
  if (first_blob->red_box_set()) {
    TBOX stored_box = first_blob->reduced_box();
    BLOBNBOX *skipped;
    do {
      it->forward();
      skipped = it->data();
    } while (skipped->cblob() == nullptr || skipped->joined_to_prev());
    return stored_box;
  }

  TBOX whole_box = first_blob->bounding_box(); // full blob bounding box
  int16_t min_left_above_xht;                  // ABOVE xht left limit
  // box of significant part
  TBOX significant_box = reduced_box_for_blob(first_blob, row, &min_left_above_xht);

  BLOBNBOX *follower;
  do {
    it->forward();
    follower = it->data();
    if (follower->cblob() == nullptr) {
      // was pre-chopped
      whole_box += follower->bounding_box();
    } else if (follower->joined_to_prev()) {
      int16_t part_left_above_xht;
      significant_box += reduced_box_for_blob(follower, row, &part_left_above_xht);
      min_left_above_xht = std::min(min_left_above_xht, part_left_above_xht);
    }
  } while (follower->cblob() == nullptr || follower->joined_to_prev()); // until next real blob

  // The reduced box is kept only when it is non-empty, its left edge is well
  // to the left of anything above the xheight, and it is tall enough.
  bool keep_reduced =
      (significant_box.width() > 0) &&
      ((significant_box.left() + tosp_near_lh_edge * significant_box.width()) <
       min_left_above_xht) &&
      (significant_box.height() > 0.7 * row->xheight);
  if (!keep_reduced) {
    significant_box = whole_box;
  }
#ifndef GRAPHICS_DISABLED
  if (keep_reduced && textord_show_initial_words) {
    significant_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW);
  }
#endif
  first_blob->set_reduced_box(significant_box);
  return significant_box;
}

/*************************************************************************
 * reduced_box_for_blob()
 * Build a box with the same bottom and top as the whole blob, but whose
 * left edge is the leftmost position of the blob ABOVE the baseline and
 * whose right edge is the rightmost position of the blob BELOW the xheight.
 * The baseline is evaluated at the horizontal centre of the blob.
 *
 * *left_above_xht receives the leftmost position of the blob above
 * baseline + 1.1 * xheight, or INT16_MAX when nothing is found there. It is
 * always written, even when an empty box is returned.
 *
 * An empty TBOX is returned when either scan finds nothing.
 *
 * !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples
 *         on "home".  Perhaps we need something which says: if the width
 *         ABOVE the xht alone includes the whole of the reduced width, then
 *         use the full blob box - Might still fail on italic F
 *
 *         Alternatively we could be a little less severe and only reduce the
 *         left and right edges by half the difference between the full box
 *         and the reduced box.
 *
 * NOTE(review): an older note here spoke of rotating coordinates for
 * find_blob_limits; the calls below use find_cblob_hlimits with a
 * (bottom, top) band directly, so that note appears obsolete - confirm.
 *************************************************************************/
TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) {
  const TBOX whole_box = blob->bounding_box();
  auto *const outline = blob->cblob();

  /* Baseline height under the centre of the blob */
  const float centre_x = (whole_box.left() + whole_box.right()) / 2.0;
  const float base_y = row->baseline.y(centre_x);

  const float no_upper_bound = static_cast<float>(INT16_MAX);
  const float no_lower_bound = static_cast<float>(-INT16_MAX);

  /*
Scan 1: leftmost extent ABOVE the xht. Used by the caller to spot caps ht
chars which should NOT have their box reduced: T, Y, V, W etc
*/
  float tall_left = static_cast<float>(INT32_MAX);
  float tall_right = static_cast<float>(-INT32_MAX);
  find_cblob_hlimits(outline, (base_y + 1.1 * row->xheight), no_upper_bound, tall_left,
                     tall_right);
  if (tall_left > tall_right) {
    *left_above_xht = INT16_MAX; // No area above xht
  } else {
    *left_above_xht = static_cast<int16_t>(std::floor(tall_left));
  }

  /*
Scan 2: reduced LH limit - the left extent of the region ABOVE the baseline.
*/
  float body_left = static_cast<float>(INT32_MAX);
  float body_left_unused = static_cast<float>(-INT32_MAX);
  find_cblob_hlimits(outline, base_y, no_upper_bound, body_left, body_left_unused);
  if (body_left > body_left_unused) {
    return TBOX(); // nothing found above the baseline: empty box
  }

  /*
Scan 3: reduced RH limit - the right extent of the region BELOW the xht.
*/
  float body_right_unused = static_cast<float>(INT32_MAX);
  float body_right = static_cast<float>(-INT32_MAX);
  find_cblob_hlimits(outline, no_lower_bound, (base_y + row->xheight), body_right_unused,
                     body_right);
  if (body_right_unused > body_right) {
    return TBOX(); // nothing found below the xht: empty box
  }

  const ICOORD bottom_left(static_cast<int16_t>(std::floor(body_left)), whole_box.bottom());
  const ICOORD top_right(static_cast<int16_t>(std::ceil(body_right)), whole_box.top());
  return TBOX(bottom_left, top_right);
}
} // namespace tesseract
author	Franz Glasner <fzglas.hg@dom66.de>
date	Fri, 19 Sep 2025 10:28:53 +0200
parents	b50eed0cc0ef
children