Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/textord/tospace.cpp comparison

comparison mupdf-source/thirdparty/tesseract/src/textord/tospace.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/**********************************************************************
+* tospace.cpp
+*
+* Compute fuzzy word spacing thresholds for each row.
+* I.e. set :   max_nonspace
+*              space_threshold
+*              min_space
+*              kern_size
+*              space_size
+* for each row.
+* ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
+*
+* Note: functions in this file were originally not members of any
+* class or enclosed by any namespace. Now they are all static members
+* of the Textord class.
+*
+**********************************************************************/
+#include "drawtord.h"
+#include "statistc.h"
+#include "textord.h"
+#include "tovars.h"
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#define MAXSPACING 128 /*max expected spacing in pix */
+namespace tesseract {
+/**
+ * Walk every block on the page and set the spacing thresholds of its
+ * proportional rows.
+ *
+ * For each block a GAPMAP is built and block_spacing_stats() produces the
+ * block-wide space / non-space gap estimates. Each row whose pitch decision
+ * is PITCH_DEF_PROP or PITCH_CORR_PROP is then handed to row_spacing_stats();
+ * other rows are only reported in debug output.
+ *
+ * @param page_tr top right of page (not referenced by this function)
+ * @param blocks  blocks on page
+ */
+void Textord::to_spacing(ICOORD page_tr,       // topright of page
+                         TO_BLOCK_LIST *blocks // blocks on page
+) {
+  // Block-level results, filled in by block_spacing_stats() through its
+  // reference parameters.
+  int16_t block_space_gap_width;     // estimated width of real spaces
+  int16_t block_non_space_gap_width; // estimated width of non space gaps
+  bool old_text_ord_proportional;    // old fixed/prop result
+
+  TO_BLOCK_IT blk_it;
+  blk_it.set_to_list(blocks);
+  int blk_no = 1; // 1-based, used for debug output only
+  for (blk_it.mark_cycle_pt(); !blk_it.cycled_list(); blk_it.forward(), ++blk_no) {
+    TO_BLOCK *cur_block = blk_it.data();
+    // map of big vert gaps in blk
+    std::unique_ptr<GAPMAP> gapmap(new GAPMAP(cur_block));
+    block_spacing_stats(cur_block, gapmap.get(), old_text_ord_proportional,
+                        block_space_gap_width, block_non_space_gap_width);
+
+    // Keep the block-level non-space estimate at or below one third of the
+    // space estimate when the old method with constrained sp/kn is selected.
+    // Useful for arabic and hindi, when the non-space gap width is often
+    // over-estimated and should not be trusted. The same 1:3 ratio is used
+    // in block_spacing_stats.
+    if (tosp_old_to_method && tosp_old_to_constrain_sp_kn) {
+      const int16_t third_of_space = block_space_gap_width / 3;
+      if (block_non_space_gap_width > third_of_space) {
+        block_non_space_gap_width = third_of_space;
+      }
+    }
+
+    TO_ROW_IT row_it(cur_block->get_rows());
+    int row_no = 1; // 1-based, used for debug output only
+    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward(), ++row_no) {
+      TO_ROW *cur_row = row_it.data();
+      const bool is_proportional = (cur_row->pitch_decision == PITCH_DEF_PROP) ||
+                                   (cur_row->pitch_decision == PITCH_CORR_PROP);
+      const bool debug_on = tosp_debug_level > 0;
+      if (is_proportional) {
+        if (debug_on && !old_text_ord_proportional) {
+          tprintf("Block %d Row %d: Now Proportional\n", blk_no, row_no);
+        }
+        row_spacing_stats(cur_row, gapmap.get(), blk_no, row_no, block_space_gap_width,
+                          block_non_space_gap_width);
+      } else if (debug_on && old_text_ord_proportional) {
+        tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", blk_no, row_no,
+                cur_row->pitch_decision, cur_row->fixed_pitch);
+      }
+#ifndef GRAPHICS_DISABLED
+      if (textord_show_initial_words) {
+        plot_word_decisions(to_win, static_cast<int16_t>(cur_row->fixed_pitch), cur_row);
+      }
+#endif
+    }
+  }
+}
+/*************************************************************************
+ * block_spacing_stats()
+ *
+ * Estimate block-wide gap widths from the rows of a block.
+ *
+ * Rows with an empty blob list are skipped; when tosp_only_use_prop_rows is
+ * set, only rows whose pitch decision is PITCH_DEF_PROP or PITCH_CORR_PROP
+ * are used.
+ *
+ * Pass 1 histograms every inter-blob gap that ignore_big_gap() does not
+ * reject and tracks the narrowest blob width.
+ * Pass 2 histograms only the gaps wider than a crude per-row threshold
+ * (and, when tosp_block_use_cert_spaces is set, only "certain" spaces).
+ *
+ * Outputs (all written on every call):
+ *   block_non_space_gap_width - floor of the median of all gaps, or the
+ *                               narrowest blob width if there is at most
+ *                               one gap sample
+ *   block_space_gap_width     - max(median of wide gaps,
+ *                               3 * block_non_space_gap_width), or -1 when
+ *                               there are too few samples
+ *   old_text_ord_proportional - debug-only fixed/proportional guess
+ *
+ * Both passes pass row_length = end_of_row - left edge of the first box to
+ * ignore_big_gap(). Pass 2 previously computed the negated value.
+ *************************************************************************/
+void Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional,
+                                  int16_t &block_space_gap_width,    // resulting estimate
+                                  int16_t &block_non_space_gap_width // resulting estimate
+) {
+  STATS centre_to_centre_stats(0, MAXSPACING - 1); // DEBUG USE ONLY
+  STATS all_gap_stats(0, MAXSPACING - 1);
+  STATS space_gap_stats(0, MAXSPACING - 1);
+  int16_t minwidth = MAXSPACING; // narrowest blob
+  BLOBNBOX_IT blob_it;           // iterator, re-pointed at each row
+
+  // True if the row takes part in the block statistics.
+  auto row_is_usable = [&](TO_ROW *row) -> bool {
+    return !row->blob_list()->empty() &&
+           (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||
+            (row->pitch_decision == PITCH_CORR_PROP));
+  };
+  // Advance blob_it and return the next box, using the configured stepper.
+  auto next_box = [&](TO_ROW *row) -> TBOX {
+    if (tosp_use_pre_chopping) {
+      return box_next_pre_chopped(&blob_it);
+    }
+    if (tosp_stats_use_xht_gaps) {
+      return reduced_box_next(row, &blob_it);
+    }
+    return box_next(&blob_it);
+  };
+
+  // Pass 1: all gaps.
+  TO_ROW_IT row_it(block->get_rows());
+  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+    TO_ROW *row = row_it.data();
+    if (!row_is_usable(row)) {
+      continue;
+    }
+    blob_it.set_to_list(row->blob_list());
+    blob_it.mark_cycle_pt();
+    const int32_t end_of_row = blob_it.data_relative(-1)->bounding_box().right();
+    TBOX blob_box = next_box(row);
+    const int32_t row_length = end_of_row - blob_box.left();
+    if (blob_box.width() < minwidth) {
+      minwidth = blob_box.width();
+    }
+    TBOX prev_blob_box = blob_box;
+    while (!blob_it.cycled_list()) {
+      blob_box = next_box(row);
+      if (blob_box.width() < minwidth) {
+        minwidth = blob_box.width();
+      }
+      const int16_t left = prev_blob_box.right();
+      const int16_t right = blob_box.left();
+      const int16_t gap_width = right - left;
+      if (!ignore_big_gap(row, row_length, gapmap, left, right)) {
+        all_gap_stats.add(gap_width, 1);
+        // DEBUG
+        const int16_t centre_to_centre =
+            (right + blob_box.right() - (prev_blob_box.left() + left)) / 2;
+        centre_to_centre_stats.add(centre_to_centre, 1);
+      }
+      prev_blob_box = blob_box;
+    }
+  }
+
+  // Inadequate samples
+  if (all_gap_stats.get_total() <= 1) {
+    block_non_space_gap_width = minwidth;
+    block_space_gap_width = -1; // No est. space width
+    // DEBUG
+    old_text_ord_proportional = true;
+    return;
+  }
+
+  /* For debug only ..... */
+  const float iqr_centre_to_centre =
+      centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25);
+  const float iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25);
+  old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats;
+  /* .......For debug only */
+
+  /*
+  The median of the gaps is used as an estimate of the NON-SPACE gap width.
+  This RELIES on the assumption that there are more gaps WITHIN words than
+  BETWEEN words in a block
+  Now try to estimate the width of a real space for all real spaces in the
+  block. Do this by using a crude threshold to ignore "narrow" gaps, then
+  find the median of the "wide" gaps and use this.
+  */
+  block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median()));
+
+  // Pass 2: wide gaps only.
+  row_it.set_to_list(block->get_rows());
+  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+    TO_ROW *row = row_it.data();
+    if (!row_is_usable(row)) {
+      continue;
+    }
+    const float real_space_threshold =
+        std::max(tosp_init_guess_kn_mult * block_non_space_gap_width,
+                 tosp_init_guess_xht_mult * row->xheight);
+    blob_it.set_to_list(row->blob_list());
+    blob_it.mark_cycle_pt();
+    const int32_t end_of_row = blob_it.data_relative(-1)->bounding_box().right();
+    TBOX blob_box = next_box(row);
+    // Same orientation as pass 1 (was blob_box.left() - end_of_row).
+    const int32_t row_length = end_of_row - blob_box.left();
+    TBOX prev_blob_box = blob_box;
+    while (!blob_it.cycled_list()) {
+      blob_box = next_box(row);
+      const int16_t left = prev_blob_box.right();
+      const int16_t right = blob_box.left();
+      const int16_t gap_width = right - left;
+      if ((gap_width > real_space_threshold) &&
+          !ignore_big_gap(row, row_length, gapmap, left, right)) {
+        /*
+        If tosp_use_cert_spaces is enabled, the estimate of the space gap is
+        restricted to obvious spaces - those wider than half the xht or
+        those with wide blobs on both sides - i.e not things that are
+        suspect 1's or punctuation that is sometimes widely spaced.
+        */
+        if (!tosp_block_use_cert_spaces ||
+            (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
+            ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
+             (!tosp_narrow_blobs_not_cert ||
+              (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
+            (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
+          space_gap_stats.add(gap_width, 1);
+        }
+      }
+      prev_blob_box = blob_box;
+    }
+  }
+
+  // Inadequate samples
+  if (space_gap_stats.get_total() <= 2) {
+    block_space_gap_width = -1; // No est. space width
+  } else {
+    // Never let the space estimate drop below 3x the non-space estimate.
+    block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())),
+                                     static_cast<int16_t>(3 * block_non_space_gap_width));
+  }
+}
+/*************************************************************************
+* row_spacing_stats()
+* Set the spacing fields of one row: kern_size, space_size,
+* space_threshold, min_space and max_nonspace.
+*
+* row                       - row whose inter-blob gaps are measured and
+*                             whose spacing fields are written
+* gapmap                    - passed on to ignore_big_gap() and
+*                             isolated_row_stats()
+* block_idx, row_idx        - used in debug output and passed on to
+*                             isolated_row_stats()
+* block_space_gap_width     - block estimate of a space; a value <= 0 means
+*                             "no estimate" and is replaced locally by
+*                             floor(xheight / 2)
+* block_non_space_gap_width - block estimate of a non-space gap
+*
+* Outline:
+*   1. Histogram the gaps between successive blobs. Gaps rejected by
+*      ignore_big_gap() are only counted (large_gap_count).
+*   2. Pick kern_size / space_size / space_threshold via old_to_method(),
+*      isolated_row_stats() or the block defaults.
+*   3. Optionally refine the threshold (improve_row_threshold) and apply the
+*      sanity rules selected by tosp_sanity_method.
+*   4. Derive the fuzzy limits min_space and max_nonspace.
+*
+* NOTE(review): the previous header said "If failure - return 0 values",
+* but this function returns void - confirm what was meant.
+*************************************************************************/
+void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx,
+int16_t block_space_gap_width,    // estimate for block
+int16_t block_non_space_gap_width // estimate for block
+) {
+// iterator over the row's blobs (pointed at the same list again below)
+BLOBNBOX_IT blob_it = row->blob_list();
+STATS all_gap_stats(0, MAXSPACING - 1); // every gap not rejected by ignore_big_gap()
+STATS cert_space_gap_stats(0, MAXSPACING - 1); // gaps >= threshold that also pass the certainty test
+STATS all_space_gap_stats(0, MAXSPACING - 1); // every gap >= real_space_threshold
+STATS small_gap_stats(0, MAXSPACING - 1); // every gap < real_space_threshold
+TBOX blob_box;
+TBOX prev_blob_box;
+int16_t gap_width;
+int16_t real_space_threshold = 0;
+int16_t max = 0; // largest pile count seen during the max_nonspace search
+int16_t large_gap_count = 0; // gaps rejected by ignore_big_gap()
+bool suspected_table;
+bool good_block_space_estimate = block_space_gap_width > 0;
+int32_t end_of_row;
+int32_t row_length = 0;
+float sane_space;
+int32_t sane_threshold;
+/* Collect first pass stats for row. Without a usable block estimate, half
+the x-height stands in for the block space width. */
+if (!good_block_space_estimate) {
+block_space_gap_width = int16_t(std::floor(row->xheight / 2));
+}
+if (!row->blob_list()->empty()) {
+if (tosp_threshold_bias1 > 0) {
+real_space_threshold =
+block_non_space_gap_width +
+int16_t(floor(0.5 + tosp_threshold_bias1 *
+(block_space_gap_width - block_non_space_gap_width)));
+} else {
+real_space_threshold = // Old TO method
+(block_space_gap_width + block_non_space_gap_width) / 2;
+}
+blob_it.set_to_list(row->blob_list());
+blob_it.mark_cycle_pt();
+end_of_row = blob_it.data_relative(-1)->bounding_box().right();
+if (tosp_use_pre_chopping) {
+blob_box = box_next_pre_chopped(&blob_it);
+} else if (tosp_stats_use_xht_gaps) {
+blob_box = reduced_box_next(row, &blob_it);
+} else {
+blob_box = box_next(&blob_it);
+}
+row_length = end_of_row - blob_box.left();
+prev_blob_box = blob_box;
+while (!blob_it.cycled_list()) {
+if (tosp_use_pre_chopping) {
+blob_box = box_next_pre_chopped(&blob_it);
+} else if (tosp_stats_use_xht_gaps) {
+blob_box = reduced_box_next(row, &blob_it);
+} else {
+blob_box = box_next(&blob_it);
+}
+int16_t left = prev_blob_box.right();
+int16_t right = blob_box.left();
+gap_width = right - left;
+if (ignore_big_gap(row, row_length, gapmap, left, right)) {
+large_gap_count++;
+} else {
+if (gap_width >= real_space_threshold) {
+if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
+((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
+(!tosp_narrow_blobs_not_cert ||
+(!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
+(wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
+cert_space_gap_stats.add(gap_width, 1);
+}
+all_space_gap_stats.add(gap_width, 1);
+} else {
+small_gap_stats.add(gap_width, 1);
+}
+all_gap_stats.add(gap_width, 1);
+}
+prev_blob_box = blob_box;
+}
+}
+suspected_table = (large_gap_count > 1) ||
+((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples)); // several big gaps, or one in a row with few samples
+/* Now determine row kern size, space size and threshold.
+Enough certain spaces (or any certain space in a suspected table / short
+row): old_to_method() on the certain spaces. Otherwise try
+isolated_row_stats(), then the block defaults, then old_to_method() on all
+the spaces. */
+if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) ||
+((suspected_table || all_gap_stats.get_total() <= tosp_short_row) &&
+cert_space_gap_stats.get_total() > 0)) {
+old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats,
+block_space_gap_width, block_non_space_gap_width);
+} else {
+if (!tosp_recovery_isolated_row_stats ||
+!isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) {
+if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) {
+tprintf("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx);
+}
+if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
+// Use block default
+row->space_size = block_space_gap_width;
+if (all_gap_stats.get_total() > tosp_redo_kern_limit) {
+row->kern_size = all_gap_stats.median();
+} else {
+row->kern_size = block_non_space_gap_width;
+}
+row->space_threshold =
+int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
+} else {
+old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats,
+block_space_gap_width, block_non_space_gap_width);
+}
+}
+}
+if (tosp_improve_thresh && !suspected_table) {
+improve_row_threshold(row, &all_gap_stats);
+}
+/* Now let's try to be careful not to do anything silly with tables when we
+are ignoring big gaps*/
+if (tosp_sanity_method == 0) {
+if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
+if (tosp_debug_level > 5) {
+tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx,
+row->kern_size, row->space_threshold, row->space_size);
+}
+row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size);
+row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
+}
+} else if (tosp_sanity_method == 1) {
+sane_space = row->space_size;
+/* NEVER let space size get too close to kern size */
+if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
+((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) {
+if (good_block_space_estimate &&
+(block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) {
+sane_space = block_space_gap_width;
+} else {
+sane_space =
+std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),
+row->xheight / 2.0f);
+}
+if (tosp_debug_level > 5) {
+tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx,
+row->kern_size, row->space_threshold, row->space_size, sane_space);
+}
+row->space_size = sane_space;
+row->space_threshold =
+int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
+}
+/* NEVER let threshold get VERY far away from kern */
+sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f)));
+if (row->space_threshold > sane_threshold) {
+if (tosp_debug_level > 5) {
+tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx,
+row->kern_size, row->space_threshold, row->space_size, sane_threshold);
+}
+row->space_threshold = sane_threshold;
+if (row->space_size <= sane_threshold) {
+row->space_size = row->space_threshold + 1.0f; // keep space_size above the lowered threshold
+}
+}
+/* Beware of tables - there may be NO spaces */
+if (suspected_table) {
+sane_space =
+std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight);
+sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2));
+if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) {
+if (tosp_debug_level > 5) {
+tprintf("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx,
+row->kern_size, row->space_threshold, row->space_size);
+}
+// the minimum sane value
+row->space_threshold = static_cast<int32_t>(sane_space);
+row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
+}
+}
+}
+/* Now let's try to put some error limits on the threshold */
+if (tosp_old_to_method) {
+/* Old textord made a space if gap >= threshold */
+// NO FUZZY SPACES YET
+row->max_nonspace = row->space_threshold;
+// NO FUZZY SPACES       YET
+row->min_space = row->space_threshold + 1;
+} else {
+/* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
+row->min_space =
+std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size));
+if (row->min_space <= row->space_threshold) {
+// Don't be silly
+row->min_space = row->space_threshold + 1;
+}
+/*
+Let's try to guess the max certain kern gap by looking at the cluster of
+kerns for the row. The row is proportional so the kerns should cluster
+tightly at the bottom of the distribution. We also expect most gaps to be
+kerns. Find the maximum of the kern piles between 0 and twice the kern
+estimate. Piles before the first one with less than 1/10 the maximum
+number of samples can be taken as certain kerns.
+Of course, there are some cases where the kern peak and space peaks merge,
+so we will put an UPPER limit on the max certain kern gap of some fraction
+below the threshold.
+*/
+// upper bound
+int32_t max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2);
+// default
+row->max_nonspace = max_max_nonspace;
+for (int32_t index = 0; index <= max_max_nonspace; index++) {
+if (all_gap_stats.pile_count(index) > max) {
+max = all_gap_stats.pile_count(index);
+}
+if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) {
+row->max_nonspace = index; // first pile above kern_size below 1/10 of the running maximum
+break;
+}
+}
+}
+/* Yet another algorithm - simpler this time - just choose a fraction of the
+threshold to space range */
+if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) {
+row->min_space = std::max(
+row->min_space, static_cast<int32_t>(ceil(row->space_threshold +
+tosp_fuzzy_sp_fraction *
+(row->space_size - row->space_threshold))));
+}
+/* Ensure that ANY space less than some multiplier times the kern size is
+fuzzy.  In tables there is a risk of erroneously setting a small space size
+when there are no real spaces. Sometimes tables have text squashed into
+columns so that the kn->sp ratio is small anyway - this means that we can't
+use this to force a wider separation - hence we rely on context to join any
+dubious breaks. */
+if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) {
+row->min_space = std::max(
+row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size)));
+}
+if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
+row->max_nonspace = static_cast<int32_t>(floor(
+0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size)));
+}
+if (row->max_nonspace > row->space_threshold) {
+// Don't be silly
+row->max_nonspace = row->space_threshold;
+}
+if (tosp_debug_level > 5) {
+tprintf(
+"B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) "
+"Sp:%3.2f\n",
+block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width,
+real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold,
+row->min_space, row->space_size);
+}
+if (tosp_debug_level > 10) {
+tprintf(
+"row->kern_size = %3.2f, row->space_size = %3.2f, "
+"row->space_threshold = %d\n",
+row->kern_size, row->space_size, row->space_threshold);
+}
+}
+/**
+ * Derive a row's space_size, kern_size and space_threshold from gap
+ * histograms, falling back to the block estimates when samples are scarce.
+ *
+ * @param row                       row whose three spacing fields are written
+ * @param all_gap_stats             histogram used for the kern median
+ * @param space_gap_stats           histogram used for the space estimate
+ * @param small_gap_stats           histogram used for the kern median when
+ *                                  tosp_only_small_gaps_for_kern is set
+ * @param block_space_gap_width     estimate for block
+ * @param block_non_space_gap_width estimate for block
+ */
+void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,
+                            STATS *small_gap_stats,
+                            int16_t block_space_gap_width,    // estimate for block
+                            int16_t block_non_space_gap_width // estimate for block
+) {
+  /* Step 1: row space size. (Old to condition was > 2) */
+  const bool enough_for_median =
+      space_gap_stats->get_total() >= tosp_enough_space_samples_for_median;
+  if (enough_for_median || space_gap_stats->get_total() >= 1) {
+    // With adequate samples use the median; with only 1 or 2 use the mean.
+    if (enough_for_median) {
+      row->space_size = space_gap_stats->median();
+    } else {
+      row->space_size = space_gap_stats->mean();
+    }
+    // Limit the estimate if it seems wildly out compared with the block.
+    if (row->space_size > block_space_gap_width * 1.5) {
+      if (tosp_old_to_bug_fix) {
+        row->space_size = block_space_gap_width * 1.5;
+      } else {
+        // BUG??? should be *1.5
+        row->space_size = block_space_gap_width;
+      }
+    }
+    // Lower bound: 2x (median case) or 3x (mean case) the block kern, plus 1.
+    const int kern_multiple = enough_for_median ? 2 : 3;
+    if (row->space_size < (block_non_space_gap_width * kern_multiple) + 1) {
+      row->space_size = (block_non_space_gap_width * kern_multiple) + 1;
+    }
+  } else {
+    // No samples at all: use block default
+    row->space_size = block_space_gap_width;
+  }
+
+  /* Step 2: row kern size */
+  const bool kern_from_small_gaps =
+      (tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit);
+  if (kern_from_small_gaps) {
+    row->kern_size = small_gap_stats->median();
+  } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) {
+    row->kern_size = all_gap_stats->median();
+  } else {
+    // old TO - same for all rows
+    row->kern_size = block_non_space_gap_width;
+  }
+
+  /* Step 3: row space threshold */
+  if (tosp_threshold_bias2 > 0) {
+    row->space_threshold = int32_t(
+        floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size)));
+  } else {
+    /*
+    NOTE old text ord uses (space_size + kern_size + 1)/2  as the threshold
+    and holds this in a float. The use is with a >= test
+    NEW textord uses an integer threshold and a > test
+    It comes to the same thing.
+    (Though there is a difference in that old textord has integer space_size
+    and kern_size.)
+    */
+    row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
+  }
+
+  // Step 4: apply the same logic and ratios as in row_spacing_stats to
+  // restrict relative values of the row's space_size, kern_size, and
+  // space_threshold.
+  if (!(tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1)) {
+    return;
+  }
+  const bool space_too_close_to_kern =
+      (row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
+      ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight);
+  if (space_too_close_to_kern) {
+    if (row->kern_size > 2.5) {
+      row->kern_size = row->space_size / tosp_min_sane_kn_sp;
+    }
+    row->space_threshold =
+        int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
+  }
+}
+/*************************************************************************
+ * isolated_row_stats()
+ * Set row->kern_size, row->space_size and row->space_threshold using the
+ * statistics of this row only.
+ *
+ * Returns false without touching the row when the row has too few gaps,
+ * too small a fraction of small gaps, or no gap at or above the crude
+ * threshold. Returns false with all three fields reset to 0 when the
+ * resulting values fail the sanity check. Returns true otherwise.
+ *************************************************************************/
+bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats,
+                                 bool suspected_table, int16_t block_idx, int16_t row_idx) {
+  // Crude split between kerns and spaces, from the median gap and x-height.
+  const float kern_estimate = all_gap_stats->median();
+  const float crude_threshold_estimate =
+      std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight);
+  const int16_t small_gaps_count =
+      stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_threshold_estimate)));
+  const int16_t total = all_gap_stats->get_total();
+
+  // Evaluate the rejection tests in order, stopping at the first that hits.
+  bool cannot_use_row = (total <= tosp_redo_kern_limit);
+  if (!cannot_use_row) {
+    cannot_use_row = ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps);
+  }
+  if (!cannot_use_row) {
+    cannot_use_row = (total - small_gaps_count < 1);
+  }
+  if (cannot_use_row) {
+    if (tosp_debug_level > 5) {
+      tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx);
+    }
+    return false;
+  }
+
+  STATS cert_space_gap_stats(0, MAXSPACING - 1);
+  STATS all_space_gap_stats(0, MAXSPACING - 1);
+  STATS small_gap_stats(0, MAXSPACING - 1);
+
+  BLOBNBOX_IT blob_it;
+  blob_it.set_to_list(row->blob_list());
+  blob_it.mark_cycle_pt();
+  // Advance the iterator and return the next box via the configured stepper.
+  auto step_box = [&]() -> TBOX {
+    if (tosp_use_pre_chopping) {
+      return box_next_pre_chopped(&blob_it);
+    }
+    if (tosp_stats_use_xht_gaps) {
+      return reduced_box_next(row, &blob_it);
+    }
+    return box_next(&blob_it);
+  };
+
+  const int32_t end_of_row = blob_it.data_relative(-1)->bounding_box().right();
+  TBOX prev_blob_box = step_box();
+  const int32_t row_length = end_of_row - prev_blob_box.left();
+  while (!blob_it.cycled_list()) {
+    TBOX blob_box = step_box();
+    const int16_t left = prev_blob_box.right();
+    const int16_t right = blob_box.left();
+    const int16_t gap_width = right - left;
+    if (!ignore_big_gap(row, row_length, gapmap, left, right) &&
+        (gap_width > crude_threshold_estimate)) {
+      // "Certain" space: wide relative to x-height, or between wide blobs.
+      const bool certain_space =
+          (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
+          ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
+           (!tosp_narrow_blobs_not_cert ||
+            (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
+          (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box));
+      if (certain_space) {
+        cert_space_gap_stats.add(gap_width, 1);
+      }
+      all_space_gap_stats.add(gap_width, 1);
+    }
+    if (gap_width < crude_threshold_estimate) {
+      small_gap_stats.add(gap_width, 1);
+    }
+    prev_blob_box = blob_box;
+  }
+
+  // Space size: prefer certain spaces, then all spaces; median when there
+  // are enough samples, mean otherwise.
+  if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
+    row->space_size = cert_space_gap_stats.median();
+  } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) {
+    // mean, to avoid spaced 1's in tables
+    row->space_size = cert_space_gap_stats.mean();
+  } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
+    row->space_size = all_space_gap_stats.median();
+  } else {
+    row->space_size = all_space_gap_stats.mean();
+  }
+
+  // Kern size: median of the small gaps, or of every gap.
+  if (tosp_only_small_gaps_for_kern) {
+    row->kern_size = small_gap_stats.median();
+  } else {
+    row->kern_size = all_gap_stats->median();
+  }
+  row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
+
+  /* Sanity check: need kern < threshold < space and threshold > 0 */
+  const bool insane = (row->kern_size >= row->space_threshold) ||
+                      (row->space_threshold >= row->space_size) ||
+                      (row->space_threshold <= 0);
+  if (insane) {
+    if (tosp_debug_level > 5) {
+      tprintf("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx,
+              row->kern_size, row->space_threshold, row->space_size);
+    }
+    row->kern_size = 0.0f;
+    row->space_threshold = 0;
+    row->space_size = 0.0f;
+    return false;
+  }
+
+  if (tosp_debug_level > 5) {
+    tprintf("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size,
+            row->space_threshold, row->space_size);
+  }
+  return true;
+}
+/**
+ * Sum the pile counts of `stats` for every index in [0, threshold).
+ * Returns 0 when threshold <= 0.
+ */
+int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {
+  int16_t samples_below = 0;
+  int16_t bucket = 0;
+  while (bucket < threshold) {
+    samples_below += stats->pile_count(bucket);
+    bucket++;
+  }
+  return samples_below;
+}
+/*************************************************************************
+ * improve_row_threshold()
+ *    Try to recognise a "normal line" -
+ *           > 25 gaps
+ *     &&    space > 3 * kn  && space > 10
+ *              (I.e. reasonably large space and kn:sp ratio)
+ *     &&    > 3/4 # gaps < kn + (sp - kn)/3
+ *              (I.e. most gaps are well away from space estimate)
+ *     &&    a gap of max(3, (sp - kn) / 3) empty histogram positions is found
+ *           somewhere in the histogram between kn and sp
+ *     THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
+ *          NO!!!!! the bristol line has "11" with a gap of 12 between the
+ *          1's!!! try moving the default threshold to within this band but
+ *          leave the fuzzy limit calculation as at present.
+ *
+ * row           - reads space_size and kern_size; may rewrite
+ *                 space_threshold only
+ * all_gap_stats - histogram of the row's gaps
+ *
+ * The row is left untouched when it is not a "normal line", when no wide
+ * enough run of empty piles is found, or when the current threshold
+ * already lies inside the run.
+ *************************************************************************/
+void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
+  float sp = row->space_size;
+  float kn = row->kern_size;
+  int16_t reqd_zero_width = 0;
+  int16_t zero_width = 0; // length of the current run of empty piles
+  int16_t zero_start = 0; // index where the current run began
+  int16_t index = 0;
+
+  // The level >10 trace is built up piecewise (" 0", " 1", ..., " 2") as
+  // each stage is passed.
+  if (tosp_debug_level > 10) {
+    tprintf("Improve row threshold 0");
+  }
+  if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) ||
+      (stats_count_under(all_gap_stats, static_cast<int16_t>(ceil(kn + (sp - kn) / 3 + 0.5))) <
+       (0.75 * all_gap_stats->get_total()))) {
+    return;
+  }
+  if (tosp_debug_level > 10) {
+    tprintf(" 1");
+  }
+  /*
+  Look for the first region of all 0's in the histogram which is wider than
+  max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current
+  threshold is not within it, move the threshold so that is just inside it.
+  */
+  reqd_zero_width = static_cast<int16_t>(floor((sp - kn) / 3 + 0.5));
+  if (reqd_zero_width < 3) {
+    reqd_zero_width = 3;
+  }
+
+  for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) {
+    if (all_gap_stats->pile_count(index) == 0) {
+      if (zero_width == 0) {
+        zero_start = index;
+      }
+      zero_width++;
+    } else {
+      if (zero_width >= reqd_zero_width) {
+        break;
+      } else {
+        zero_width = 0;
+      }
+    }
+  }
+  // Step back to the last index examined as part of the run: either the
+  // pile before the non-empty one that ended it, or floor(sp) - 1.
+  index--;
+  if (tosp_debug_level > 10) {
+    // End the trace line with a real newline (the format used to end in "/n").
+    tprintf(" reqd_z_width: %d found %d 0's, starting %d; thresh: %d\n", reqd_zero_width,
+            zero_width, zero_start, row->space_threshold);
+  }
+  if ((zero_width < reqd_zero_width) ||
+      ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) {
+    return;
+  }
+  if (tosp_debug_level > 10) {
+    tprintf(" 2");
+  }
+  // Pull the threshold to the nearer edge of the empty band.
+  if (row->space_threshold < zero_start) {
+    if (tosp_debug_level > 5) {
+      tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n", kn, sp, zero_start,
+              index, row->space_threshold, zero_start);
+    }
+    row->space_threshold = zero_start;
+  }
+  if (row->space_threshold > index) {
+    if (tosp_debug_level > 5) {
+      tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n", kn, sp, zero_start,
+              index, row->space_threshold, index);
+    }
+    row->space_threshold = index;
+  }
+}
+/**********************************************************************
+* make_prop_words
+*
+* Convert a TO_ROW to a ROW.
+*
+* Blobs from row->blob_list() are collected into words; make_a_word_break()
+* decides where one word ends and the next begins. Words held in
+* row->rep_words (repeated character words) are spliced in at the start of
+* the row, between ordinary words, and at the end of the row, whenever the
+* next blob starts to the right of the pending repeated-char word.
+*
+* Returns a newly allocated ROW holding the words, or nullptr when the row
+* has no blobs. The rotation argument is not used by this function.
+**********************************************************************/
+ROW *Textord::make_prop_words(TO_ROW *row,    // row to make
+                              FCOORD rotation // for drawing
+) {
+  bool bol; // start of line
+  /* prev_ values are for start of word being built. non prev_ values are for
+  the gap between the word being built and the next one. */
+  bool prev_fuzzy_sp;     // probably space
+  bool prev_fuzzy_non;    // probably not
+  uint8_t prev_blanks;    // in front of word
+  bool fuzzy_sp = false;  // probably space
+  bool fuzzy_non = false; // probably not
+  uint8_t blanks = 0;     // in front of word
+  bool prev_gap_was_a_space = false;
+  bool break_at_next_gap = false;
+  ROW *real_row; // output row
+  C_OUTLINE_IT cout_it;
+  C_BLOB_LIST cblobs;
+  C_BLOB_IT cblob_it = &cblobs;
+  WERD_LIST words;
+  WERD *word; // new word
+  int32_t next_rep_char_word_right = INT32_MAX;
+  float repetition_spacing; // gap between repetitions
+  int32_t xstarts[2];       // row ends
+  int32_t prev_x;           // end of prev blob
+  BLOBNBOX_IT box_it;       // iterator
+  TBOX prev_blob_box;
+  TBOX next_blob_box;
+  int16_t prev_gap = INT16_MAX;
+  int16_t current_gap = INT16_MAX;
+  int16_t next_gap = INT16_MAX;
+  int16_t prev_within_xht_gap = INT16_MAX;
+  int16_t current_within_xht_gap = INT16_MAX;
+  int16_t next_within_xht_gap = INT16_MAX;
+  int16_t word_count = 0;
+
+  // Number of blanks implied by a gap beside a repeated-char word: none when
+  // the gap is no wider than tosp_rep_space * rep_spacing, otherwise the gap
+  // measured in whole space widths and kept within [1, UINT8_MAX].
+  // A non-positive space_size yields 1 blank instead of dividing by it, and the
+  // clamp keeps the float -> uint8_t conversion inside the representable range.
+  auto blanks_for_rep_gap = [&](int16_t gap, float rep_spacing) -> uint8_t {
+    if (!(gap > tosp_rep_space * rep_spacing)) {
+      return 0;
+    }
+    if (!(row->space_size > 0.0f)) {
+      return 1; // Avoid FP division by 0.
+    }
+    const float spaces = static_cast<float>(std::floor(gap / row->space_size));
+    return static_cast<uint8_t>(std::min(std::max(spaces, 1.0f), static_cast<float>(UINT8_MAX)));
+  };
+
+  // repeated char words
+  WERD_IT rep_char_it(&(row->rep_words));
+  if (!rep_char_it.empty()) {
+    next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
+  }
+  prev_x = -INT16_MAX;
+  cblob_it.set_to_list(&cblobs);
+  box_it.set_to_list(row->blob_list());
+  // new words
+  WERD_IT word_it(&words);
+  bol = true;
+  prev_blanks = 0;
+  prev_fuzzy_sp = false;
+  prev_fuzzy_non = false;
+  if (!box_it.empty()) {
+    xstarts[0] = box_it.data()->bounding_box().left();
+    if (xstarts[0] > next_rep_char_word_right) {
+      /* We need to insert a repeated char word at the start of the row */
+      word = rep_char_it.extract();
+      word_it.add_after_then_move(word);
+      /* Set spaces before repeated char word */
+      word->set_flag(W_BOL, true);
+      bol = false;
+      word->set_blanks(0);
+      // NO uncertainty
+      word->set_flag(W_FUZZY_SP, false);
+      word->set_flag(W_FUZZY_NON, false);
+      xstarts[0] = word->bounding_box().left();
+      /* Set spaces after repeated char word (and leave current word set) */
+      repetition_spacing = find_mean_blob_spacing(word);
+      current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right;
+      current_within_xht_gap = current_gap;
+      prev_blanks = blanks_for_rep_gap(current_gap, repetition_spacing);
+      if (tosp_debug_level > 5) {
+        tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f;  Rgap:%d  ",
+                box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(),
+                repetition_spacing, current_gap);
+      }
+      prev_fuzzy_sp = false;
+      prev_fuzzy_non = false;
+      if (rep_char_it.empty()) {
+        next_rep_char_word_right = INT32_MAX;
+      } else {
+        rep_char_it.forward();
+        next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
+      }
+    }
+    peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
+    do {
+      auto bblob = box_it.data();
+      auto blob_box = bblob->bounding_box();
+      if (bblob->joined_to_prev()) {
+        // Fold this blob's outlines into the blob most recently added to cblobs.
+        auto cblob = bblob->remove_cblob();
+        if (cblob != nullptr) {
+          cout_it.set_to_list(cblob_it.data()->out_list());
+          cout_it.move_to_last();
+          cout_it.add_list_after(cblob->out_list());
+          delete cblob;
+        }
+      } else {
+        auto cblob = bblob->cblob();
+        if (cblob != nullptr) {
+          bblob->set_owns_cblob(false);
+          cblob_it.add_after_then_move(cblob);
+        }
+        prev_x = blob_box.right();
+      }
+      box_it.forward(); // next one
+      bblob = box_it.data();
+      blob_box = bblob->bounding_box();
+      if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
+        /* Real Blob - not multiple outlines or pre-chopped */
+        prev_gap = current_gap;
+        prev_within_xht_gap = current_within_xht_gap;
+        prev_blob_box = next_blob_box;
+        current_gap = next_gap;
+        current_within_xht_gap = next_within_xht_gap;
+        peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
+        int16_t prev_gap_arg = prev_gap;
+        int16_t next_gap_arg = next_gap;
+        if (tosp_only_use_xht_gaps) {
+          prev_gap_arg = prev_within_xht_gap;
+          next_gap_arg = next_within_xht_gap;
+        }
+        // Decide if a word-break should be inserted
+        if (blob_box.left() > next_rep_char_word_right ||
+            make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap,
+                              current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp,
+                              fuzzy_non, prev_gap_was_a_space, break_at_next_gap) ||
+            box_it.at_first()) {
+          /* Form a new word out of the blobs collected */
+          word = new WERD(&cblobs, prev_blanks, nullptr);
+          word_count++;
+          word_it.add_after_then_move(word);
+          if (bol) {
+            word->set_flag(W_BOL, true);
+            bol = false;
+          }
+          if (prev_fuzzy_sp) {
+            // probably space
+            word->set_flag(W_FUZZY_SP, true);
+          } else if (prev_fuzzy_non) {
+            // probably not
+            word->set_flag(W_FUZZY_NON, true);
+          }
+          if (blob_box.left() > next_rep_char_word_right) {
+            /* We need to insert a repeated char word */
+            word = rep_char_it.extract();
+            word_it.add_after_then_move(word);
+            /* Set spaces before repeated char word */
+            repetition_spacing = find_mean_blob_spacing(word);
+            current_gap = word->bounding_box().left() - prev_x;
+            current_within_xht_gap = current_gap;
+            blanks = blanks_for_rep_gap(current_gap, repetition_spacing);
+            if (tosp_debug_level > 5) {
+              tprintf("Repch wd (%d,%d) rep gap %5.2f;  Lgap:%d (%d blanks);",
+                      word->bounding_box().left(), word->bounding_box().bottom(),
+                      repetition_spacing, current_gap, blanks);
+            }
+            word->set_blanks(blanks);
+            // NO uncertainty
+            word->set_flag(W_FUZZY_SP, false);
+            word->set_flag(W_FUZZY_NON, false);
+            /* Set spaces after repeated char word (and leave current word set)
+             */
+            current_gap = blob_box.left() - next_rep_char_word_right;
+            blanks = blanks_for_rep_gap(current_gap, repetition_spacing);
+            if (tosp_debug_level > 5) {
+              tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks);
+            }
+            fuzzy_sp = false;
+            fuzzy_non = false;
+            if (rep_char_it.empty()) {
+              next_rep_char_word_right = INT32_MAX;
+            } else {
+              rep_char_it.forward();
+              next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
+            }
+          }
+          if (box_it.at_first() && rep_char_it.empty()) {
+            // at end of line
+            word->set_flag(W_EOL, true);
+            xstarts[1] = prev_x;
+          } else {
+            // Remember the gap properties for the word that will be built next.
+            prev_blanks = blanks;
+            prev_fuzzy_sp = fuzzy_sp;
+            prev_fuzzy_non = fuzzy_non;
+          }
+        }
+      }
+    } while (!box_it.at_first()); // until back at start
+    /* Insert any further repeated char words */
+    while (!rep_char_it.empty()) {
+      word = rep_char_it.extract();
+      word_it.add_after_then_move(word);
+      /* Set spaces before repeated char word */
+      repetition_spacing = find_mean_blob_spacing(word);
+      current_gap = word->bounding_box().left() - prev_x;
+      blanks = blanks_for_rep_gap(current_gap, repetition_spacing);
+      if (tosp_debug_level > 5) {
+        tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
+                word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing,
+                current_gap, blanks);
+      }
+      word->set_blanks(blanks);
+      // NO uncertainty
+      word->set_flag(W_FUZZY_SP, false);
+      word->set_flag(W_FUZZY_NON, false);
+      prev_x = word->bounding_box().right();
+      if (rep_char_it.empty()) {
+        // at end of line
+        word->set_flag(W_EOL, true);
+        xstarts[1] = prev_x;
+      } else {
+        rep_char_it.forward();
+      }
+    }
+    real_row =
+        new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
+    word_it.set_to_list(real_row->word_list());
+    // put words in row
+    word_it.add_list_after(&words);
+    real_row->recalc_bounding_box();
+    if (tosp_debug_level > 4) {
+      tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
+              real_row->bounding_box().left(), real_row->bounding_box().bottom(),
+              real_row->bounding_box().right(), real_row->bounding_box().top());
+    }
+    return real_row;
+  }
+  return nullptr;
+}
+/**********************************************************************
+* make_blob_words
+*
+* Converts words into blobs so that each blob is a single character.
+*  Used for chopper test.
+*
+* Every blob of row->blob_list() that is not flagged joined_to_prev starts a
+* new word; blobs flagged joined_to_prev have their outlines appended to the
+* blob collected before them. Each word is created with one leading blank.
+* Returns a newly allocated ROW, or nullptr when the row has no blobs.
+* The rotation argument is not used by this function.
+**********************************************************************/
+ROW *Textord::make_blob_words(TO_ROW *row,    // row to make
+                              FCOORD rotation // for drawing
+) {
+  C_OUTLINE_IT cout_it;
+  C_BLOB_LIST cblobs;
+  C_BLOB_IT cblob_it = &cblobs;
+  WERD_LIST words;
+  BLOBNBOX_IT box_it; // iterator
+  int16_t word_count = 0;
+  cblob_it.set_to_list(&cblobs);
+  box_it.set_to_list(row->blob_list());
+  // new words
+  WERD_IT word_it(&words);
+  if (box_it.empty()) {
+    return nullptr; // nothing to convert
+  }
+  bool bol = true; // start of line
+  do {
+    auto bblob = box_it.data();
+    if (bblob->joined_to_prev()) {
+      // Fold this blob's outlines into the blob most recently added to cblobs.
+      auto cblob = bblob->remove_cblob();
+      if (cblob != nullptr) {
+        cout_it.set_to_list(cblob_it.data()->out_list());
+        cout_it.move_to_last();
+        cout_it.add_list_after(cblob->out_list());
+        delete cblob;
+      }
+    } else {
+      auto cblob = bblob->cblob();
+      if (cblob != nullptr) {
+        bblob->set_owns_cblob(false);
+        cblob_it.add_after_then_move(cblob);
+      }
+    }
+    box_it.forward(); // next one
+    bblob = box_it.data();
+    // The collected blob is complete once the following blob is not a
+    // continuation of it.
+    if (!bblob->joined_to_prev() && !cblobs.empty()) {
+      WERD *word = new WERD(&cblobs, 1, nullptr); // new word
+      word_count++;
+      word_it.add_after_then_move(word);
+      if (bol) {
+        word->set_flag(W_BOL, true);
+        bol = false;
+      }
+      if (box_it.at_first()) { // at end of line
+        word->set_flag(W_EOL, true);
+      }
+    }
+  } while (!box_it.at_first()); // until back at start
+  /* Setup the row with created words. */
+  ROW *real_row = // output row
+      new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
+  word_it.set_to_list(real_row->word_list());
+  // put words in row
+  word_it.add_list_after(&words);
+  real_row->recalc_bounding_box();
+  if (tosp_debug_level > 4) {
+    tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
+            real_row->bounding_box().left(), real_row->bounding_box().bottom(),
+            real_row->bounding_box().right(), real_row->bounding_box().top());
+  }
+  return real_row;
+}
+bool Textord::make_a_word_break(TO_ROW *row,   // row being made
+TBOX blob_box, // box of the blob that follows the gap being judged
+int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap,
+int16_t within_xht_current_gap, TBOX next_blob_box,
+int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non,
+bool &prev_gap_was_a_space, bool &break_at_next_gap) { /*
+* make_a_word_break
+*
+* Decide whether the current gap is a word break. Returns true when a break
+* should be made.
+*
+* real_current_gap / within_xht_current_gap are two measurements of the gap
+* being judged; which one is compared against the row thresholds depends on
+* tosp_use_xht_gaps and tosp_only_use_xht_gaps (see below).
+* prev_gap, prev_blob_box, next_blob_box and next_gap describe the
+* neighbouring gaps and blobs and feed the narrow/wide/punctuation heuristics.
+*
+* Outputs: blanks, fuzzy_sp and fuzzy_non describe the break; they are only
+* meaningful when the function returns true.
+* State carried between calls: break_at_next_gap (a break forced for the
+* following call) and prev_gap_was_a_space (updated by the new method only).
+*/
+bool space;
+int16_t current_gap;
+float fuzzy_sp_to_kn_limit;
+if (break_at_next_gap) { // a break was requested by an earlier call
+break_at_next_gap = false;
+return true; // blanks, fuzzy_sp and fuzzy_non are left as the caller passed them
+}
+/* Inhibit using the reduced gap if
+The kerning is large - chars are not kerned and reducing "f"s can cause
+erroneous blanks
+OR  The real gap is less than 0
+OR  The real gap is less than the kerning estimate
+NOTE(review): no separate "less than 0" test is visible below; the only gap
+test is the comparison against tosp_dont_fool_with_small_kerns * kern_size.
+*/
+if ((row->kern_size > tosp_large_kerning * row->xheight) ||
+((tosp_dont_fool_with_small_kerns >= 0) &&
+(real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) {
+// Ignore the difference
+within_xht_current_gap = real_current_gap;
+}
+if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) { // both switches must be on
+current_gap = within_xht_current_gap;
+} else {
+current_gap = real_current_gap;
+}
+if (tosp_old_to_method) {
+// Boring old method
+space = current_gap > row->max_nonspace;
+if (space && (current_gap < INT16_MAX)) { // outputs untouched for an INT16_MAX gap
+if (current_gap < row->min_space) { // between max_nonspace and min_space: uncertain
+if (current_gap > row->space_threshold) {
+blanks = 1;
+fuzzy_sp = true;
+fuzzy_non = false;
+} else {
+blanks = 0;
+fuzzy_sp = false;
+fuzzy_non = true;
+}
+} else {
+if (row->space_size == 0.0f) {
+// Avoid FP division by 0.
+blanks = 1;
+} else {
+blanks = static_cast<uint8_t>(current_gap / row->space_size);
+if (blanks < 1) {
+blanks = 1;
+}
+}
+fuzzy_sp = false;
+fuzzy_non = false;
+}
+}
+return space;
+} else {
+/* New exciting heuristic method */
+if (prev_blob_box.null_box()) { // Beginning of row
+prev_gap_was_a_space = true;
+}
+// Default as old TO
+space = current_gap > row->space_threshold;
+/* Set defaults for the word break in case we find one.  Currently there are
+no fuzzy spaces. Depending on the reliability of the different heuristics
+we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
+be used if the function returns true - ie the word is to be broken.
+*/
+int num_blanks = current_gap; // used as-is unless space_size exceeds 1
+if (row->space_size > 1.0f) {
+num_blanks = IntCastRounded(current_gap / row->space_size);
+}
+blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX)); // always at least 1
+fuzzy_sp = false;
+fuzzy_non = false;
+/*
+If xht measure causes gap to flip one of the 3 thresholds act accordingly -
+despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
+context.
+The second argument of each mark_gap call below is the id of the rule that
+fired.
+*/
+if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) &&
+(within_xht_current_gap > row->max_nonspace)) {
+space = true;
+fuzzy_non = true;
+#ifndef GRAPHICS_DISABLED
+mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
+next_gap);
+#endif
+} else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) &&
+(within_xht_current_gap > row->space_threshold)) {
+space = true;
+if (tosp_flip_fuzz_kn_to_sp) {
+fuzzy_sp = true;
+} else {
+fuzzy_non = true;
+}
+#ifndef GRAPHICS_DISABLED
+mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
+next_gap);
+#endif
+} else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) &&
+(within_xht_current_gap >= row->min_space)) {
+space = true;
+#ifndef GRAPHICS_DISABLED
+mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
+next_gap);
+#endif
+} else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) &&
+suspected_punct_blob(row, blob_box)) {
+break_at_next_gap = true; // acted on at the top of the next call
+}
+/* Now continue with normal heuristics */
+else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) {
+/* Heuristics to turn dubious spaces to kerns */
+if (tosp_pass_wide_fuzz_sp_to_context > 0) {
+fuzzy_sp_to_kn_limit =
+row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size);
+} else {
+fuzzy_sp_to_kn_limit = 99999.0f; // effectively no limit
+}
+/* If current gap is significantly smaller than the previous space the
+other side of a narrow blob then this gap is a kern. */
+if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space &&
+(current_gap <= tosp_gap_factor * prev_gap)) {
+if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
+if (tosp_flip_fuzz_sp_to_kn) {
+fuzzy_non = true;
+} else {
+fuzzy_sp = true;
+}
+} else {
+space = false;
+}
+#ifndef GRAPHICS_DISABLED
+mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
+next_gap);
+#endif
+}
+/* If current gap not much bigger than the previous kern the other side of
+a narrow blob then this gap is a kern as well */
+else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) &&
+!prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) {
+if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
+if (tosp_flip_fuzz_sp_to_kn) {
+fuzzy_non = true;
+} else {
+fuzzy_sp = true;
+}
+} else {
+space = false;
+}
+#ifndef GRAPHICS_DISABLED
+mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
+next_gap);
+#endif
+} else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
+(next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) {
+if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
+if (tosp_flip_fuzz_sp_to_kn) {
+fuzzy_non = true;
+} else {
+fuzzy_sp = true;
+}
+} else {
+space = false;
+}
+#ifndef GRAPHICS_DISABLED
+mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
+next_gap);
+#endif
+} else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
+(next_gap <= row->space_threshold) &&
+(current_gap * tosp_gap_factor <= next_gap)) {
+if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
+if (tosp_flip_fuzz_sp_to_kn) {
+fuzzy_non = true;
+} else {
+fuzzy_sp = true;
+}
+} else {
+space = false;
+}
+#ifndef GRAPHICS_DISABLED
+mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
+next_gap);
+#endif
+} else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) ||
+((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) {
+fuzzy_sp = true; // a narrow neighbour but no rule above fired: keep the break, mark it fuzzy
+#ifndef GRAPHICS_DISABLED
+mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
+next_gap);
+#endif
+}
+} else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) {
+/* Heuristics to turn dubious kerns to spaces */
+/* TRIED THIS BUT IT MADE THINGS WORSE
+if (prev_gap == INT16_MAX)
+prev_gap = 0;  // start of row
+if (next_gap == INT16_MAX)
+next_gap = 0;  // end of row
+*/
+if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) &&
+(current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&
+wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) {
+space = true;
+/*
+tosp_flip_caution is an attempt to stop the default changing in cases
+where there is a large difference between the kern and space estimates.
+See problem in 'chiefs' where "have" gets split in the quotation.
+*/
+if ((tosp_flip_fuzz_kn_to_sp) &&
+((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) {
+fuzzy_sp = true;
+} else {
+fuzzy_non = true;
+}
+#ifndef GRAPHICS_DISABLED
+mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
+next_gap);
+#endif
+} else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 &&
+current_gap > 5 && // Rule 9 handles small gap, big ratio.
+current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&
+!(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) &&
+!(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) {
+space = true;
+fuzzy_non = true;
+#ifndef GRAPHICS_DISABLED
+mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
+next_gap);
+#endif
+} else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) &&
+(next_blob_box.width() > 0) &&
+(current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&
+(!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) &&
+!suspected_punct_blob(row, next_blob_box)))) {
+space = true;
+fuzzy_non = true;
+#ifndef GRAPHICS_DISABLED
+mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
+next_gap);
+#endif
+}
+}
+if (tosp_debug_level > 10) {
+tprintf(
+"word break = %d current_gap = %d, prev_gap = %d, "
+"next_gap = %d\n",
+space ? 1 : 0, current_gap, prev_gap, next_gap);
+}
+prev_gap_was_a_space = space && !(fuzzy_non); // a fuzzy "probably not" break does not count as a space
+return space;
+}
+}
+// A blob counts as narrow when it is no wider than tosp_narrow_fraction of the
+// row's x-height, or when its width/height ratio is at most
+// tosp_narrow_aspect_ratio.
+bool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
+  if (blob_box.width() <= tosp_narrow_fraction * row->xheight) {
+    return true;
+  }
+  const float aspect_ratio = static_cast<float>(blob_box.width()) / blob_box.height();
+  return aspect_ratio <= tosp_narrow_aspect_ratio;
+}
+// Wide-blob test. With tosp_wide_fraction switched off (not > 0) a blob is
+// wide simply when it is not narrow. Otherwise it must be at least
+// tosp_wide_fraction of the x-height wide and, when tosp_wide_aspect_ratio is
+// also switched on, have a width/height ratio above that value.
+bool Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
+  if (!(tosp_wide_fraction > 0)) {
+    return !narrow_blob(row, blob_box);
+  }
+  const bool wide_enough = blob_box.width() >= tosp_wide_fraction * row->xheight;
+  if (!(tosp_wide_aspect_ratio > 0)) {
+    return wide_enough;
+  }
+  return wide_enough &&
+         ((static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio);
+}
+// Guess whether a box is punctuation: it is either short (at most 0.66 of the
+// x-height), lies wholly below the line half an x-height above the baseline,
+// or lies wholly above that line.
+bool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
+  /* Find baseline of centre of blob */
+  const float centre_x = (box.right() + box.left()) / 2.0;
+  const float base_y = row->baseline.y(centre_x);
+  if (box.height() <= 0.66 * row->xheight) {
+    return true;
+  }
+  const auto mid_xheight_y = base_y + row->xheight / 2.0;
+  return (box.top() < mid_xheight_y) || (box.bottom() > mid_xheight_y);
+}
+// Look ahead from box_it without disturbing the caller's iterator (box_it is
+// taken by value). Reports the box returned by box_next() for the blob at
+// box_it and the gap between that box and the blob which follows it, measured
+// both on the full boxes (next_gap) and on the boxes produced by
+// reduced_box_next() (next_within_xht_gap). Both gaps are INT16_MAX when the
+// look-ahead wraps round to the start of the list.
+void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box,
+                               int16_t &next_gap, int16_t &next_within_xht_gap) {
+  BLOBNBOX_IT xht_it = box_it; // second cursor for the reduced boxes
+  next_blob_box = box_next(&box_it);
+  TBOX next_xht_box = reduced_box_next(row, &xht_it);
+  if (box_it.at_first()) {
+    next_gap = INT16_MAX;
+    next_within_xht_gap = INT16_MAX;
+    return;
+  }
+  TBOX following_box = box_it.data()->bounding_box();
+  next_gap = following_box.left() - next_blob_box.right();
+  TBOX following_xht_box = reduced_box_next(row, &xht_it);
+  next_within_xht_gap = following_xht_box.left() - next_xht_box.right();
+}
+#ifndef GRAPHICS_DISABLED
+// Debug aid for the space/kern heuristics. When textord_show_initial_words is
+// set, an ellipse is drawn in the gap to the left of blob, coloured by the id
+// of the rule that fired. When tosp_debug_level > 5 the rule and the
+// surrounding gap/blob measurements are also printed.
+void Textord::mark_gap(TBOX blob,    // blob following gap
+                       int16_t rule, // heuristic id
+                       int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,
+                       int16_t next_blob_width, int16_t next_gap) {
+  ScrollView::Color pen_colour; // of ellipse marking flipped gap
+  switch (rule) {
+    case 1:
+      pen_colour = ScrollView::RED;
+      break;
+    case 2:
+    case 20:
+      pen_colour = ScrollView::CYAN;
+      break;
+    case 3:
+    case 21:
+      pen_colour = ScrollView::GREEN;
+      break;
+    case 5:
+    case 22:
+      pen_colour = ScrollView::MAGENTA;
+      break;
+    case 6:
+      pen_colour = ScrollView::BLUE;
+      break;
+    case 7:
+      pen_colour = ScrollView::WHITE;
+      break;
+    case 8:
+      pen_colour = ScrollView::YELLOW;
+      break;
+    case 4:
+    case 9:
+    default:
+      pen_colour = ScrollView::BLACK;
+      break;
+  }
+  if (textord_show_initial_words) {
+    to_win->Pen(pen_colour);
+    const float x_radius = current_gap / 2.0f;
+    const float y_radius = blob.height() / 2.0f;
+    const float x_centre = blob.left() - current_gap / 2.0f;
+    const float y_centre = blob.bottom() + blob.height() / 2.0f;
+    to_win->Ellipse(x_radius, y_radius, x_centre, y_centre);
+  }
+  if (tosp_debug_level > 5) {
+    tprintf("  (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2,
+            blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap);
+  }
+}
+#endif
+// Mean horizontal gap between consecutive blobs of a word: the left edge of
+// each blob minus the right edge of the blob before it, averaged over all
+// adjacent pairs. Returns 0 when the word has fewer than two blobs.
+float Textord::find_mean_blob_spacing(WERD *word) {
+  C_BLOB_IT blob_it;
+  blob_it.set_to_list(word->cblob_list());
+  if (blob_it.empty()) {
+    return 0.0f;
+  }
+  int32_t total_gap = 0;
+  int16_t num_gaps = 0;
+  blob_it.mark_cycle_pt();
+  // The first blob only supplies the initial right edge.
+  int16_t last_right = blob_it.data()->bounding_box().right();
+  blob_it.forward();
+  while (!blob_it.cycled_list()) {
+    TBOX box = blob_it.data()->bounding_box();
+    total_gap += box.left() - last_right;
+    num_gaps++;
+    last_right = box.right();
+    blob_it.forward();
+  }
+  return (num_gaps > 0) ? (total_gap / static_cast<float>(num_gaps)) : 0.0f;
+}
+// Decide whether the gap spanning [left, right] is too big to be used.
+// tosp_ignore_big_gaps selects the policy:
+//   > 999 : never ignore a gap;
+//   > 0   : ignore gaps wider than that many x-heights;
+//   == 0  : ignore gaps over tosp_ignore_very_big_gaps x-heights, and also
+//           moderately big gaps in long rows or where the gapmap reports a
+//           table gap;
+//   < 0   : ignore gaps over tosp_ignore_very_big_gaps x-heights, and gaps
+//           over gapmap_big_gaps x-heights that the gapmap reports as table gaps.
+bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left,
+                             int16_t right) {
+  const int16_t gap_width = right - left + 1;
+  if (tosp_ignore_big_gaps > 999) {
+    return false; // Don't ignore
+  }
+  if (tosp_ignore_big_gaps > 0) {
+    return (gap_width > tosp_ignore_big_gaps * row->xheight);
+  }
+  if (gap_width > tosp_ignore_very_big_gaps * row->xheight) {
+    return true;
+  }
+  if (tosp_ignore_big_gaps == 0) {
+    return ((gap_width > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) ||
+           ((gap_width > 1.75 * row->xheight) &&
+            ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right)));
+  }
+  /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table
+   */
+  return (gap_width > gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right);
+}
+/**********************************************************************
+* reduced_box_next
+*
+* Compute the bounding box of this blob with merging of x overlaps
+* but no pre-chopping.
+* Then move the iterator on to the start of the next blob.
+* DON'T reduce the box for small things - eg punctuation.
+*
+* The result is stored on the head blob with set_reduced_box(), so a later
+* call for the same blob returns the stored box and only advances the
+* iterator.
+**********************************************************************/
+TBOX Textord::reduced_box_next(TO_ROW *row,    // current row
+                               BLOBNBOX_IT *it // iterator to blobs
+) {
+  BLOBNBOX *first_blob = it->data(); // blob that stores the result
+  if (first_blob->red_box_set()) {
+    // Already computed: just skip to the next real blob.
+    TBOX cached_box = first_blob->reduced_box();
+    BLOBNBOX *scan;
+    do {
+      it->forward();
+      scan = it->data();
+    } while (scan->cblob() == nullptr || scan->joined_to_prev());
+    return cached_box;
+  }
+  TBOX whole_box = first_blob->bounding_box(); // full blob bounding box
+  int16_t left_above_xht;                      // ABOVE xht left limit
+  TBOX xht_box = reduced_box_for_blob(first_blob, row, &left_above_xht); // box of significant part
+  for (;;) {
+    it->forward();
+    BLOBNBOX *piece = it->data();
+    if (piece->cblob() == nullptr) {
+      // was pre-chopped
+      whole_box += piece->bounding_box();
+    } else if (piece->joined_to_prev()) {
+      int16_t piece_left_above_xht; // ABOVE xht left limit of this piece
+      xht_box += reduced_box_for_blob(piece, row, &piece_left_above_xht);
+      left_above_xht = std::min(left_above_xht, piece_left_above_xht);
+    } else {
+      break; // reached the next real blob
+    }
+  }
+  // The reduced box is kept only when it is non-empty, starts far enough to the
+  // left of the part above the x-height, and is taller than 0.7 x-heights.
+  const bool keep_reduced =
+      (xht_box.width() > 0) &&
+      ((xht_box.left() + tosp_near_lh_edge * xht_box.width()) < left_above_xht) &&
+      (xht_box.height() > 0.7 * row->xheight);
+  if (keep_reduced) {
+#ifndef GRAPHICS_DISABLED
+    if (textord_show_initial_words) {
+      xht_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW);
+    }
+#endif
+  } else {
+    xht_box = whole_box;
+  }
+  first_blob->set_reduced_box(xht_box);
+  return xht_box;
+}
+/*************************************************************************
+* reduced_box_for_blob()
+* Find box for blob which is the same height and y position as the whole blob,
+* but whose left limit is the left most position of the blob ABOVE the
+* baseline and whose right limit is the right most position of the blob BELOW
+* the xheight.
+*
+* *left_above_xht receives the left most position of the part of the blob
+* above baseline + 1.1 * xheight, or INT16_MAX when there is no such part.
+* An empty TBOX is returned when either limit search finds nothing.
+*
+* !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
+*         "home".  Perhaps we need something which say if the width ABOVE the
+*         xht alone includes the whole of the reduced width, then use the full
+*         blob box - Might still fail on italic F
+*
+*         Alternatively we could be a little less severe and only reduce the
+*         left and right edges by half the difference between the full box and
+*         the reduced box.
+*
+* NOTE that we need to rotate all the coordinates as
+* find_blob_limits finds the y min and max within a specified x band
+*************************************************************************/
+TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) {
+  /* Find baseline of centre of blob */
+  TBOX whole_box = blob->bounding_box();
+  const float centre_x = (whole_box.left() + whole_box.right()) / 2.0;
+  const float base_y = row->baseline.y(centre_x);
+
+  /*
+  Find LH limit of blob ABOVE the xht. This is so that we can detect certain
+  caps ht chars which should NOT have their box reduced: T, Y, V, W etc
+  */
+  float cap_left = static_cast<float>(INT32_MAX);
+  float cap_right = static_cast<float>(-INT32_MAX); // only used to detect "nothing found"
+  find_cblob_hlimits(blob->cblob(), (base_y + 1.1 * row->xheight), static_cast<float>(INT16_MAX),
+                     cap_left, cap_right);
+  if (cap_left > cap_right) {
+    *left_above_xht = INT16_MAX; // No area above xht
+  } else {
+    *left_above_xht = static_cast<int16_t>(std::floor(cap_left));
+  }
+
+  /*
+  Find reduced LH limit of blob - the left extent of the region ABOVE the
+  baseline.
+  */
+  float body_left = static_cast<float>(INT32_MAX);
+  float body_left_check = static_cast<float>(-INT32_MAX);
+  find_cblob_hlimits(blob->cblob(), base_y, static_cast<float>(INT16_MAX), body_left,
+                     body_left_check);
+  if (body_left > body_left_check) {
+    return TBOX(); // no area within xht so return empty box
+  }
+
+  /*
+  Find reduced RH limit of blob - the right extent of the region BELOW the xht.
+  */
+  float body_right_check = static_cast<float>(INT32_MAX);
+  float body_right = static_cast<float>(-INT32_MAX);
+  find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX), (base_y + row->xheight),
+                     body_right_check, body_right);
+  if (body_right_check > body_right) {
+    return TBOX(); // no area within xht so return empty box
+  }
+
+  return TBOX(ICOORD(static_cast<int16_t>(std::floor(body_left)), whole_box.bottom()),
+              ICOORD(static_cast<int16_t>(std::ceil(body_right)), whole_box.top()));
+}
+} // namespace tesseract

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/textord/tospace.cpp @ 2:b50eed0cc0ef upstream