Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/tospace.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/tospace.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,1768 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/********************************************************************** + * tospace.cpp + * + * Compute fuzzy word spacing thresholds for each row. + * I.e. set : max_nonspace + * space_threshold + * min_space + * kern_size + * space_size + * for each row. + * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE + * + * Note: functions in this file were originally not members of any + * class or enclosed by any namespace. Now they are all static members + * of the Textord class. + * + **********************************************************************/ + +#include "drawtord.h" +#include "statistc.h" +#include "textord.h" +#include "tovars.h" + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include <algorithm> +#include <cmath> +#include <memory> + +#define MAXSPACING 128 /*max expected spacing in pix */ + +namespace tesseract { +void Textord::to_spacing(ICOORD page_tr, // topright of page + TO_BLOCK_LIST *blocks // blocks on page +) { + TO_BLOCK_IT block_it; // iterator + TO_BLOCK *block; // current block; + TO_ROW *row; // current row + int block_index; // block number + int row_index; // row number + // estimated width of real spaces for whole block + int16_t block_space_gap_width; + // estimated width of non space gaps for whole block + int16_t block_non_space_gap_width; + bool old_text_ord_proportional; // old fixed/prop result + + block_it.set_to_list(blocks); + block_index = 1; + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + block = block_it.data(); + std::unique_ptr<GAPMAP> gapmap(new GAPMAP(block)); // map of big vert gaps in blk + block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width, + block_non_space_gap_width); + // Make sure relative values of block-level space and non-space gap + // widths are reasonable. The ratio of 1:3 is also used in + // block_spacing_stats, to correct the block_space_gap_width. + // Useful for arabic and hindi, when the non-space gap width is + // often over-estimated and should not be trusted. A similar ratio + // is found in block_spacing_stats. + if (tosp_old_to_method && tosp_old_to_constrain_sp_kn && + block_non_space_gap_width > block_space_gap_width / 3) { + block_non_space_gap_width = block_space_gap_width / 3; + } + // row iterator + TO_ROW_IT row_it(block->get_rows()); + row_index = 1; + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + if ((row->pitch_decision == PITCH_DEF_PROP) || (row->pitch_decision == PITCH_CORR_PROP)) { + if ((tosp_debug_level > 0) && !old_text_ord_proportional) { + tprintf("Block %d Row %d: Now Proportional\n", block_index, row_index); + } + row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width, + block_non_space_gap_width); + } else { + if ((tosp_debug_level > 0) && old_text_ord_proportional) { + tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_index, + row_index, row->pitch_decision, row->fixed_pitch); + } + } +#ifndef GRAPHICS_DISABLED + if (textord_show_initial_words) { + plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row); + } +#endif + row_index++; + } + block_index++; + } +} + +/************************************************************************* + * block_spacing_stats() + *************************************************************************/ + +void Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional, + int16_t &block_space_gap_width, // resulting estimate + int16_t &block_non_space_gap_width // resulting estimate +) { + TO_ROW *row; // current row + BLOBNBOX_IT blob_it; // iterator + + STATS centre_to_centre_stats(0, MAXSPACING - 1); + // DEBUG USE ONLY + STATS all_gap_stats(0, MAXSPACING - 1); + STATS space_gap_stats(0, MAXSPACING - 1); + int16_t minwidth = MAXSPACING; // narrowest blob + TBOX blob_box; + TBOX prev_blob_box; + int16_t centre_to_centre; + int16_t gap_width; + float real_space_threshold; + float iqr_centre_to_centre; // DEBUG USE ONLY + float iqr_all_gap_stats; // DEBUG USE ONLY + int32_t end_of_row; + int32_t row_length; + + // row iterator + TO_ROW_IT row_it(block->get_rows()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + if (!row->blob_list()->empty() && + (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) || + (row->pitch_decision == PITCH_CORR_PROP))) { + blob_it.set_to_list(row->blob_list()); + blob_it.mark_cycle_pt(); + end_of_row = blob_it.data_relative(-1)->bounding_box().right(); + if (tosp_use_pre_chopping) { + blob_box = box_next_pre_chopped(&blob_it); + } else if (tosp_stats_use_xht_gaps) { + blob_box = reduced_box_next(row, &blob_it); + } else { + blob_box = box_next(&blob_it); + } + row_length = end_of_row - blob_box.left(); + if (blob_box.width() < minwidth) { + minwidth = blob_box.width(); + } + prev_blob_box = blob_box; + while (!blob_it.cycled_list()) { + if (tosp_use_pre_chopping) { + blob_box = box_next_pre_chopped(&blob_it); + } else if (tosp_stats_use_xht_gaps) { + blob_box = reduced_box_next(row, &blob_it); + } else { + blob_box = box_next(&blob_it); + } + if (blob_box.width() < minwidth) { + minwidth = blob_box.width(); + } + int16_t left = prev_blob_box.right(); + int16_t right = blob_box.left(); + gap_width = right - left; + if (!ignore_big_gap(row, row_length, gapmap, left, right)) { + all_gap_stats.add(gap_width, 1); + + centre_to_centre = (right + blob_box.right() - (prev_blob_box.left() + left)) / 2; + // DEBUG + centre_to_centre_stats.add(centre_to_centre, 1); + // DEBUG + } + prev_blob_box = blob_box; + } + } + } + + // Inadequate samples + if (all_gap_stats.get_total() <= 1) { + block_non_space_gap_width = minwidth; + block_space_gap_width = -1; // No est. space width + // DEBUG + old_text_ord_proportional = true; + } else { + /* For debug only ..... */ + iqr_centre_to_centre = centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25); + iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25); + old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats; + /* .......For debug only */ + + /* +The median of the gaps is used as an estimate of the NON-SPACE gap width. +This RELIES on the assumption that there are more gaps WITHIN words than +BETWEEN words in a block + +Now try to estimate the width of a real space for all real spaces in the +block. Do this by using a crude threshold to ignore "narrow" gaps, then +find the median of the "wide" gaps and use this. +*/ + block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median())); + // median gap + + row_it.set_to_list(block->get_rows()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + if (!row->blob_list()->empty() && + (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) || + (row->pitch_decision == PITCH_CORR_PROP))) { + real_space_threshold = std::max(tosp_init_guess_kn_mult * block_non_space_gap_width, + tosp_init_guess_xht_mult * row->xheight); + blob_it.set_to_list(row->blob_list()); + blob_it.mark_cycle_pt(); + end_of_row = blob_it.data_relative(-1)->bounding_box().right(); + if (tosp_use_pre_chopping) { + blob_box = box_next_pre_chopped(&blob_it); + } else if (tosp_stats_use_xht_gaps) { + blob_box = reduced_box_next(row, &blob_it); + } else { + blob_box = box_next(&blob_it); + } + row_length = blob_box.left() - end_of_row; + prev_blob_box = blob_box; + while (!blob_it.cycled_list()) { + if (tosp_use_pre_chopping) { + blob_box = box_next_pre_chopped(&blob_it); + } else if (tosp_stats_use_xht_gaps) { + blob_box = reduced_box_next(row, &blob_it); + } else { + blob_box = box_next(&blob_it); + } + int16_t left = prev_blob_box.right(); + int16_t right = blob_box.left(); + gap_width = right - left; + if ((gap_width > real_space_threshold) && + !ignore_big_gap(row, row_length, gapmap, left, right)) { + /* +If tosp_use_cert_spaces is enabled, the estimate of the space gap is +restricted to obvious spaces - those wider than half the xht or +those with wide blobs on both sides - i.e not things that are +suspect 1's or punctuation that is sometimes widely spaced. +*/ + if (!tosp_block_use_cert_spaces || + (gap_width > tosp_fuzzy_space_factor2 * row->xheight) || + ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && + (!tosp_narrow_blobs_not_cert || + (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) || + (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { + space_gap_stats.add(gap_width, 1); + } + } + prev_blob_box = blob_box; + } + } + } + // Inadequate samples + if (space_gap_stats.get_total() <= 2) { + block_space_gap_width = -1; // No est. space width + } else { + block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())), + static_cast<int16_t>(3 * block_non_space_gap_width)); + } + } +} + +/************************************************************************* + * row_spacing_stats() + * Set values for min_space, max_non_space based on row stats only + * If failure - return 0 values. + *************************************************************************/ +void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx, + int16_t block_space_gap_width, // estimate for block + int16_t block_non_space_gap_width // estimate for block +) { + // iterator + BLOBNBOX_IT blob_it = row->blob_list(); + STATS all_gap_stats(0, MAXSPACING - 1); + STATS cert_space_gap_stats(0, MAXSPACING - 1); + STATS all_space_gap_stats(0, MAXSPACING - 1); + STATS small_gap_stats(0, MAXSPACING - 1); + TBOX blob_box; + TBOX prev_blob_box; + int16_t gap_width; + int16_t real_space_threshold = 0; + int16_t max = 0; + int16_t large_gap_count = 0; + bool suspected_table; + bool good_block_space_estimate = block_space_gap_width > 0; + int32_t end_of_row; + int32_t row_length = 0; + float sane_space; + int32_t sane_threshold; + + /* Collect first pass stats for row */ + + if (!good_block_space_estimate) { + block_space_gap_width = int16_t(std::floor(row->xheight / 2)); + } + if (!row->blob_list()->empty()) { + if (tosp_threshold_bias1 > 0) { + real_space_threshold = + block_non_space_gap_width + + int16_t(floor(0.5 + tosp_threshold_bias1 * + (block_space_gap_width - block_non_space_gap_width))); + } else { + real_space_threshold = // Old TO method + (block_space_gap_width + block_non_space_gap_width) / 2; + } + blob_it.set_to_list(row->blob_list()); + blob_it.mark_cycle_pt(); + end_of_row = blob_it.data_relative(-1)->bounding_box().right(); + if (tosp_use_pre_chopping) { + blob_box = box_next_pre_chopped(&blob_it); + } else if (tosp_stats_use_xht_gaps) { + blob_box = reduced_box_next(row, &blob_it); + } else { + blob_box = box_next(&blob_it); + } + row_length = end_of_row - blob_box.left(); + prev_blob_box = blob_box; + while (!blob_it.cycled_list()) { + if (tosp_use_pre_chopping) { + blob_box = box_next_pre_chopped(&blob_it); + } else if (tosp_stats_use_xht_gaps) { + blob_box = reduced_box_next(row, &blob_it); + } else { + blob_box = box_next(&blob_it); + } + int16_t left = prev_blob_box.right(); + int16_t right = blob_box.left(); + gap_width = right - left; + if (ignore_big_gap(row, row_length, gapmap, left, right)) { + large_gap_count++; + } else { + if (gap_width >= real_space_threshold) { + if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) || + ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && + (!tosp_narrow_blobs_not_cert || + (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) || + (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { + cert_space_gap_stats.add(gap_width, 1); + } + all_space_gap_stats.add(gap_width, 1); + } else { + small_gap_stats.add(gap_width, 1); + } + all_gap_stats.add(gap_width, 1); + } + prev_blob_box = blob_box; + } + } + suspected_table = (large_gap_count > 1) || + ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples)); + + /* Now determine row kern size, space size and threshold */ + + if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) || + ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) && + cert_space_gap_stats.get_total() > 0)) { + old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats, + block_space_gap_width, block_non_space_gap_width); + } else { + if (!tosp_recovery_isolated_row_stats || + !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) { + if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) { + tprintf("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx); + } + if (tosp_row_use_cert_spaces1 && good_block_space_estimate) { + // Use block default + row->space_size = block_space_gap_width; + if (all_gap_stats.get_total() > tosp_redo_kern_limit) { + row->kern_size = all_gap_stats.median(); + } else { + row->kern_size = block_non_space_gap_width; + } + row->space_threshold = + int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor)); + } else { + old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats, + block_space_gap_width, block_non_space_gap_width); + } + } + } + + if (tosp_improve_thresh && !suspected_table) { + improve_row_threshold(row, &all_gap_stats); + } + + /* Now lets try to be careful not to do anything silly with tables when we +are ignoring big gaps*/ + if (tosp_sanity_method == 0) { + if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) { + if (tosp_debug_level > 5) { + tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx, + row->kern_size, row->space_threshold, row->space_size); + } + row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size); + row->space_size = std::max(row->space_threshold + 1.0f, row->xheight); + } + } else if (tosp_sanity_method == 1) { + sane_space = row->space_size; + /* NEVER let space size get too close to kern size */ + if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) || + ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) { + if (good_block_space_estimate && + (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) { + sane_space = block_space_gap_width; + } else { + sane_space = + std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f), + row->xheight / 2.0f); + } + if (tosp_debug_level > 5) { + tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx, + row->kern_size, row->space_threshold, row->space_size, sane_space); + } + row->space_size = sane_space; + row->space_threshold = + int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor)); + } + /* NEVER let threshold get VERY far away from kern */ + sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f))); + if (row->space_threshold > sane_threshold) { + if (tosp_debug_level > 5) { + tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx, + row->kern_size, row->space_threshold, row->space_size, sane_threshold); + } + row->space_threshold = sane_threshold; + if (row->space_size <= sane_threshold) { + row->space_size = row->space_threshold + 1.0f; + } + } + /* Beware of tables - there may be NO spaces */ + if (suspected_table) { + sane_space = + std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight); + sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2)); + + if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) { + if (tosp_debug_level > 5) { + tprintf("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx, + row->kern_size, row->space_threshold, row->space_size); + } + // the minimum sane value + row->space_threshold = static_cast<int32_t>(sane_space); + row->space_size = std::max(row->space_threshold + 1.0f, row->xheight); + } + } + } + + /* Now lets try to put some error limits on the threshold */ + + if (tosp_old_to_method) { + /* Old textord made a space if gap >= threshold */ + // NO FUZZY SPACES YET + row->max_nonspace = row->space_threshold; + // NO FUZZY SPACES YET + row->min_space = row->space_threshold + 1; + } else { + /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */ + row->min_space = + std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size)); + if (row->min_space <= row->space_threshold) { + // Don't be silly + row->min_space = row->space_threshold + 1; + } + /* +Lets try to guess the max certain kern gap by looking at the cluster of +kerns for the row. The row is proportional so the kerns should cluster +tightly at the bottom of the distribution. We also expect most gaps to be +kerns. Find the maximum of the kern piles between 0 and twice the kern +estimate. Piles before the first one with less than 1/10 the maximum +number of samples can be taken as certain kerns. + + Of course, there are some cases where the kern peak and space peaks merge, + so we will put an UPPER limit on the max certain kern gap of some fraction + below the threshold. +*/ + + // upper bound + int32_t max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2); + + // default + row->max_nonspace = max_max_nonspace; + for (int32_t index = 0; index <= max_max_nonspace; index++) { + if (all_gap_stats.pile_count(index) > max) { + max = all_gap_stats.pile_count(index); + } + if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) { + row->max_nonspace = index; + break; + } + } + } + + /* Yet another algorithm - simpler this time - just choose a fraction of the +threshold to space range */ + + if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) { + row->min_space = std::max( + row->min_space, static_cast<int32_t>(ceil(row->space_threshold + + tosp_fuzzy_sp_fraction * + (row->space_size - row->space_threshold)))); + } + + /* Ensure that ANY space less than some multiplier times the kern size is +fuzzy. In tables there is a risk of erroneously setting a small space size +when there are no real spaces. Sometimes tables have text squashed into +columns so that the kn->sp ratio is small anyway - this means that we can't +use this to force a wider separation - hence we rely on context to join any +dubious breaks. */ + + if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) { + row->min_space = std::max( + row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size))); + } + + if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) { + row->max_nonspace = static_cast<int32_t>(floor( + 0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size))); + } + if (row->max_nonspace > row->space_threshold) { + // Don't be silly + row->max_nonspace = row->space_threshold; + } + + if (tosp_debug_level > 5) { + tprintf( + "B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) " + "Sp:%3.2f\n", + block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width, + real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold, + row->min_space, row->space_size); + } + if (tosp_debug_level > 10) { + tprintf( + "row->kern_size = %3.2f, row->space_size = %3.2f, " + "row->space_threshold = %d\n", + row->kern_size, row->space_size, row->space_threshold); + } +} + +void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats, + STATS *small_gap_stats, + int16_t block_space_gap_width, // estimate for block + int16_t block_non_space_gap_width // estimate for block +) { + /* First, estimate row space size */ + /* Old to condition was > 2 */ + if (space_gap_stats->get_total() >= tosp_enough_space_samples_for_median) { + // Adequate samples + /* Set space size to median of spaces BUT limits it if it seems wildly out + */ + row->space_size = space_gap_stats->median(); + if (row->space_size > block_space_gap_width * 1.5) { + if (tosp_old_to_bug_fix) { + row->space_size = block_space_gap_width * 1.5; + } else { + // BUG??? should be *1.5 + row->space_size = block_space_gap_width; + } + } + if (row->space_size < (block_non_space_gap_width * 2) + 1) { + row->space_size = (block_non_space_gap_width * 2) + 1; + } + } + // Only 1 or 2 samples + else if (space_gap_stats->get_total() >= 1) { + // hence mean not median + row->space_size = space_gap_stats->mean(); + if (row->space_size > block_space_gap_width * 1.5) { + if (tosp_old_to_bug_fix) { + row->space_size = block_space_gap_width * 1.5; + } else { + // BUG??? should be *1.5 + row->space_size = block_space_gap_width; + } + } + if (row->space_size < (block_non_space_gap_width * 3) + 1) { + row->space_size = (block_non_space_gap_width * 3) + 1; + } + } else { + // Use block default + row->space_size = block_space_gap_width; + } + + /* Next, estimate row kern size */ + if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) { + row->kern_size = small_gap_stats->median(); + } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) { + row->kern_size = all_gap_stats->median(); + } else { // old TO -SAME FOR ALL ROWS + row->kern_size = block_non_space_gap_width; + } + + /* Finally, estimate row space threshold */ + if (tosp_threshold_bias2 > 0) { + row->space_threshold = int32_t( + floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size))); + } else { + /* + NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold +and holds this in a float. The use is with a >= test +NEW textord uses an integer threshold and a > test +It comes to the same thing. + (Though there is a difference in that old textor has integer space_size + and kern_size.) +*/ + row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2)); + } + + // Apply the same logic and ratios as in row_spacing_stats to + // restrict relative values of the row's space_size, kern_size, and + // space_threshold + if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 && + ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) || + ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight))) { + if (row->kern_size > 2.5) { + row->kern_size = row->space_size / tosp_min_sane_kn_sp; + } + row->space_threshold = + int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor)); + } +} + +/************************************************************************* + * isolated_row_stats() + * Set values for min_space, max_non_space based on row stats only + *************************************************************************/ +bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats, + bool suspected_table, int16_t block_idx, int16_t row_idx) { + float kern_estimate; + float crude_threshold_estimate; + int16_t small_gaps_count; + int16_t total; + // iterator + BLOBNBOX_IT blob_it = row->blob_list(); + STATS cert_space_gap_stats(0, MAXSPACING - 1); + STATS all_space_gap_stats(0, MAXSPACING - 1); + STATS small_gap_stats(0, MAXSPACING - 1); + TBOX blob_box; + TBOX prev_blob_box; + int16_t gap_width; + int32_t end_of_row; + int32_t row_length; + + kern_estimate = all_gap_stats->median(); + crude_threshold_estimate = + std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight); + small_gaps_count = + stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_threshold_estimate))); + total = all_gap_stats->get_total(); + + if ((total <= tosp_redo_kern_limit) || + ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) || + (total - small_gaps_count < 1)) { + if (tosp_debug_level > 5) { + tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx); + } + return false; + } + blob_it.set_to_list(row->blob_list()); + blob_it.mark_cycle_pt(); + end_of_row = blob_it.data_relative(-1)->bounding_box().right(); + if (tosp_use_pre_chopping) { + blob_box = box_next_pre_chopped(&blob_it); + } else if (tosp_stats_use_xht_gaps) { + blob_box = reduced_box_next(row, &blob_it); + } else { + blob_box = box_next(&blob_it); + } + row_length = end_of_row - blob_box.left(); + prev_blob_box = blob_box; + while (!blob_it.cycled_list()) { + if (tosp_use_pre_chopping) { + blob_box = box_next_pre_chopped(&blob_it); + } else if (tosp_stats_use_xht_gaps) { + blob_box = reduced_box_next(row, &blob_it); + } else { + blob_box = box_next(&blob_it); + } + int16_t left = prev_blob_box.right(); + int16_t right = blob_box.left(); + gap_width = right - left; + if (!ignore_big_gap(row, row_length, gapmap, left, right) && + (gap_width > crude_threshold_estimate)) { + if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) || + ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && + (!tosp_narrow_blobs_not_cert || + (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) || + (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { + cert_space_gap_stats.add(gap_width, 1); + } + all_space_gap_stats.add(gap_width, 1); + } + if (gap_width < crude_threshold_estimate) { + small_gap_stats.add(gap_width, 1); + } + + prev_blob_box = blob_box; + } + if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) { + // median + row->space_size = cert_space_gap_stats.median(); + } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) { + // to avoid spaced + row->space_size = cert_space_gap_stats.mean(); + // 1's in tables + } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) { + // median + row->space_size = all_space_gap_stats.median(); + } else { + row->space_size = all_space_gap_stats.mean(); + } + + if (tosp_only_small_gaps_for_kern) { + row->kern_size = small_gap_stats.median(); + } else { + row->kern_size = all_gap_stats->median(); + } + row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2)); + /* Sanity check */ + if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) || + (row->space_threshold <= 0)) { + if (tosp_debug_level > 5) { + tprintf("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx, + row->kern_size, row->space_threshold, row->space_size); + } + row->kern_size = 0.0f; + row->space_threshold = 0; + row->space_size = 0.0f; + return false; + } + + if (tosp_debug_level > 5) { + tprintf("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size, + row->space_threshold, row->space_size); + } + return true; +} + +int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) { + int16_t index; + int16_t total = 0; + + for (index = 0; index < threshold; index++) { + total += stats->pile_count(index); + } + return total; +} + +/************************************************************************* + * improve_row_threshold() + * Try to recognise a "normal line" - + * > 25 gaps + * && space > 3 * kn && space > 10 + * (I.e. reasonably large space and kn:sp ratio) + * && > 3/4 # gaps < kn + (sp - kn)/3 + * (I.e. most gaps are well away from space estimate) + * && a gap of max(3, (sp - kn) / 3) empty histogram positions is found + * somewhere in the histogram between kn and sp + * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies + * NO!!!!! the bristol line has "11" with a gap of 12 between the + *1's!!! try moving the default threshold to within this band but leave the + * fuzzy limit calculation as at present. + *************************************************************************/ +void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) { + float sp = row->space_size; + float kn = row->kern_size; + int16_t reqd_zero_width = 0; + int16_t zero_width = 0; + int16_t zero_start = 0; + int16_t index = 0; + + if (tosp_debug_level > 10) { + tprintf("Improve row threshold 0"); + } + if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) || + (stats_count_under(all_gap_stats, static_cast<int16_t>(ceil(kn + (sp - kn) / 3 + 0.5))) < + (0.75 * all_gap_stats->get_total()))) { + return; + } + if (tosp_debug_level > 10) { + tprintf(" 1"); + } + /* +Look for the first region of all 0's in the histogram which is wider than +max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current +threshold is not within it, move the threshold so that is just inside it. +*/ + reqd_zero_width = static_cast<int16_t>(floor((sp - kn) / 3 + 0.5)); + if (reqd_zero_width < 3) { + reqd_zero_width = 3; + } + + for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) { + if (all_gap_stats->pile_count(index) == 0) { + if (zero_width == 0) { + zero_start = index; + } + zero_width++; + } else { + if (zero_width >= reqd_zero_width) { + break; + } else { + zero_width = 0; + } + } + } + index--; + if (tosp_debug_level > 10) { + tprintf(" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n", reqd_zero_width, + zero_width, zero_start, row->space_threshold); + } + if ((zero_width < reqd_zero_width) || + ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) { + return; + } + if (tosp_debug_level > 10) { + tprintf(" 2"); + } + if (row->space_threshold < zero_start) { + if (tosp_debug_level > 5) { + tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start, + index, row->space_threshold, zero_start); + } + row->space_threshold = zero_start; + } + if (row->space_threshold > index) { + if (tosp_debug_level > 5) { + tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start, + index, row->space_threshold, index); + } + row->space_threshold = index; + } +} + +/********************************************************************** + * make_prop_words + * + * Convert a TO_ROW to a ROW. + **********************************************************************/ +ROW *Textord::make_prop_words(TO_ROW *row, // row to make + FCOORD rotation // for drawing +) { + bool bol; // start of line + /* prev_ values are for start of word being built. non prev_ values are for +the gap between the word being built and the next one. */ + bool prev_fuzzy_sp; // probably space + bool prev_fuzzy_non; // probably not + uint8_t prev_blanks; // in front of word + bool fuzzy_sp = false; // probably space + bool fuzzy_non = false; // probably not + uint8_t blanks = 0; // in front of word + bool prev_gap_was_a_space = false; + bool break_at_next_gap = false; + ROW *real_row; // output row + C_OUTLINE_IT cout_it; + C_BLOB_LIST cblobs; + C_BLOB_IT cblob_it = &cblobs; + WERD_LIST words; + WERD *word; // new word + int32_t next_rep_char_word_right = INT32_MAX; + float repetition_spacing; // gap between repetitions + int32_t xstarts[2]; // row ends + int32_t prev_x; // end of prev blob + BLOBNBOX_IT box_it; // iterator + TBOX prev_blob_box; + TBOX next_blob_box; + int16_t prev_gap = INT16_MAX; + int16_t current_gap = INT16_MAX; + int16_t next_gap = INT16_MAX; + int16_t prev_within_xht_gap = INT16_MAX; + int16_t current_within_xht_gap = INT16_MAX; + int16_t next_within_xht_gap = INT16_MAX; + int16_t word_count = 0; + + // repeated char words + WERD_IT rep_char_it(&(row->rep_words)); + if (!rep_char_it.empty()) { + next_rep_char_word_right = rep_char_it.data()->bounding_box().right(); + } + + prev_x = -INT16_MAX; + cblob_it.set_to_list(&cblobs); + box_it.set_to_list(row->blob_list()); + // new words + WERD_IT word_it(&words); + bol = true; + prev_blanks = 0; + prev_fuzzy_sp = false; + prev_fuzzy_non = false; + if (!box_it.empty()) { + xstarts[0] = box_it.data()->bounding_box().left(); + if (xstarts[0] > next_rep_char_word_right) { + /* We need to insert a repeated char word at the start of the row */ + word = rep_char_it.extract(); + word_it.add_after_then_move(word); + /* Set spaces before repeated char word */ + word->set_flag(W_BOL, true); + bol = false; + word->set_blanks(0); + // NO uncertainty + word->set_flag(W_FUZZY_SP, false); + word->set_flag(W_FUZZY_NON, false); + xstarts[0] = word->bounding_box().left(); + /* Set spaces after repeated char word (and leave current word set) */ + repetition_spacing = find_mean_blob_spacing(word); + current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right; + current_within_xht_gap = current_gap; + if (current_gap > tosp_rep_space * repetition_spacing) { + prev_blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size)); + if (prev_blanks < 1) { + prev_blanks = 1; + } + } else { + prev_blanks = 0; + } + if (tosp_debug_level > 5) { + tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ", + box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(), + repetition_spacing, current_gap); + } + prev_fuzzy_sp = false; + prev_fuzzy_non = false; + if (rep_char_it.empty()) { + next_rep_char_word_right = INT32_MAX; + } else { + rep_char_it.forward(); + next_rep_char_word_right = rep_char_it.data()->bounding_box().right(); + } + } + + peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap); + do { + auto bblob = box_it.data(); + auto blob_box = bblob->bounding_box(); + if (bblob->joined_to_prev()) { + auto cblob = bblob->remove_cblob(); + if (cblob != nullptr) { + cout_it.set_to_list(cblob_it.data()->out_list()); + cout_it.move_to_last(); + cout_it.add_list_after(cblob->out_list()); + delete cblob; + } + } else { + auto cblob = bblob->cblob(); + if (cblob != nullptr) { + bblob->set_owns_cblob(false); + cblob_it.add_after_then_move(cblob); + } + prev_x = blob_box.right(); + } + box_it.forward(); // next one + bblob = box_it.data(); + blob_box = bblob->bounding_box(); + + if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) { + /* Real Blob - not multiple outlines or pre-chopped */ + prev_gap = current_gap; + prev_within_xht_gap = current_within_xht_gap; + prev_blob_box = next_blob_box; + current_gap = next_gap; + current_within_xht_gap = next_within_xht_gap; + peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap); + + int16_t prev_gap_arg = prev_gap; + int16_t next_gap_arg = next_gap; + if (tosp_only_use_xht_gaps) { + prev_gap_arg = prev_within_xht_gap; + next_gap_arg = next_within_xht_gap; + } + // Decide if a word-break should be inserted + if (blob_box.left() > next_rep_char_word_right || + make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap, + current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp, + fuzzy_non, prev_gap_was_a_space, break_at_next_gap) || + box_it.at_first()) { + /* Form a new word out of the blobs collected */ + word = new WERD(&cblobs, prev_blanks, nullptr); + word_count++; + word_it.add_after_then_move(word); + if (bol) { + word->set_flag(W_BOL, true); + bol = false; + } + if (prev_fuzzy_sp) { + // probably space + word->set_flag(W_FUZZY_SP, true); + } else if (prev_fuzzy_non) { + word->set_flag(W_FUZZY_NON, true); + } + // probably not + + if (blob_box.left() > next_rep_char_word_right) { + /* We need to insert a repeated char word */ + word = rep_char_it.extract(); + word_it.add_after_then_move(word); + + /* Set spaces before repeated char word */ + repetition_spacing = find_mean_blob_spacing(word); + current_gap = word->bounding_box().left() - prev_x; + current_within_xht_gap = current_gap; + if (current_gap > tosp_rep_space * repetition_spacing) { + blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size)); + if (blanks < 1) { + blanks = 1; + } + } else { + blanks = 0; + } + if (tosp_debug_level > 5) { + tprintf("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);", + word->bounding_box().left(), word->bounding_box().bottom(), + repetition_spacing, current_gap, blanks); + } + word->set_blanks(blanks); + // NO uncertainty + word->set_flag(W_FUZZY_SP, false); + word->set_flag(W_FUZZY_NON, false); + + /* Set spaces after repeated char word (and leave current word set) + */ + current_gap = blob_box.left() - next_rep_char_word_right; + if (current_gap > tosp_rep_space * repetition_spacing) { + blanks = static_cast<uint8_t>(current_gap / row->space_size); + if (blanks < 1) { + blanks = 1; + } + } else { + blanks = 0; + } + if (tosp_debug_level > 5) { + tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks); + } + fuzzy_sp = false; + fuzzy_non = false; + + if (rep_char_it.empty()) { + next_rep_char_word_right = INT32_MAX; + } else { + rep_char_it.forward(); + next_rep_char_word_right = rep_char_it.data()->bounding_box().right(); + } + } + + if (box_it.at_first() && rep_char_it.empty()) { + // at end of line + word->set_flag(W_EOL, true); + xstarts[1] = prev_x; + } else { + prev_blanks = blanks; + prev_fuzzy_sp = fuzzy_sp; + prev_fuzzy_non = fuzzy_non; + } + } + } + } while (!box_it.at_first()); // until back at start + + /* Insert any further repeated char words */ + while (!rep_char_it.empty()) { + word = rep_char_it.extract(); + word_it.add_after_then_move(word); + + /* Set spaces before repeated char word */ + repetition_spacing = find_mean_blob_spacing(word); + current_gap = word->bounding_box().left() - prev_x; + if (current_gap > tosp_rep_space * repetition_spacing) { + blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size)); + if (blanks < 1) { + blanks = 1; + } + } else { + blanks = 0; + } + if (tosp_debug_level > 5) { + tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n", + word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing, + current_gap, blanks); + } + word->set_blanks(blanks); + // NO uncertainty + word->set_flag(W_FUZZY_SP, false); + word->set_flag(W_FUZZY_NON, false); + prev_x = word->bounding_box().right(); + if (rep_char_it.empty()) { + // at end of line + word->set_flag(W_EOL, true); + xstarts[1] = prev_x; + } else { + rep_char_it.forward(); + } + } + real_row = + new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size)); + word_it.set_to_list(real_row->word_list()); + // put words in row + word_it.add_list_after(&words); + real_row->recalc_bounding_box(); + + if (tosp_debug_level > 4) { + tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count, + real_row->bounding_box().left(), real_row->bounding_box().bottom(), + real_row->bounding_box().right(), real_row->bounding_box().top()); + } + return real_row; + } + return nullptr; +} + +/********************************************************************** + * make_blob_words + * + * Converts words into blobs so that each blob is a single character. + * Used for chopper test. + **********************************************************************/ +ROW *Textord::make_blob_words(TO_ROW *row, // row to make + FCOORD rotation // for drawing +) { + bool bol; // start of line + ROW *real_row; // output row + C_OUTLINE_IT cout_it; + C_BLOB_LIST cblobs; + C_BLOB_IT cblob_it = &cblobs; + WERD_LIST words; + WERD *word; // new word + BLOBNBOX_IT box_it; // iterator + int16_t word_count = 0; + + cblob_it.set_to_list(&cblobs); + box_it.set_to_list(row->blob_list()); + // new words + WERD_IT word_it(&words); + bol = true; + if (!box_it.empty()) { + do { + auto bblob = box_it.data(); + auto blob_box = bblob->bounding_box(); + if (bblob->joined_to_prev()) { + auto cblob = bblob->remove_cblob(); + if (cblob != nullptr) { + cout_it.set_to_list(cblob_it.data()->out_list()); + cout_it.move_to_last(); + cout_it.add_list_after(cblob->out_list()); + delete cblob; + } + } else { + auto cblob = bblob->cblob(); + if (cblob != nullptr) { + bblob->set_owns_cblob(false); + cblob_it.add_after_then_move(cblob); + } + } + box_it.forward(); // next one + bblob = box_it.data(); + blob_box = bblob->bounding_box(); + + if (!bblob->joined_to_prev() && !cblobs.empty()) { + word = new WERD(&cblobs, 1, nullptr); + word_count++; + word_it.add_after_then_move(word); + if (bol) { + word->set_flag(W_BOL, true); + bol = false; + } + if (box_it.at_first()) { // at end of line + word->set_flag(W_EOL, true); + } + } + } while (!box_it.at_first()); // until back at start + /* Setup the row with created words. */ + real_row = + new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size)); + word_it.set_to_list(real_row->word_list()); + // put words in row + word_it.add_list_after(&words); + real_row->recalc_bounding_box(); + if (tosp_debug_level > 4) { + tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count, + real_row->bounding_box().left(), real_row->bounding_box().bottom(), + real_row->bounding_box().right(), real_row->bounding_box().top()); + } + return real_row; + } + return nullptr; +} + +bool Textord::make_a_word_break(TO_ROW *row, // row being made + TBOX blob_box, // for next_blob // how many blanks? + int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap, + int16_t within_xht_current_gap, TBOX next_blob_box, + int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non, + bool &prev_gap_was_a_space, bool &break_at_next_gap) { + bool space; + int16_t current_gap; + float fuzzy_sp_to_kn_limit; + + if (break_at_next_gap) { + break_at_next_gap = false; + return true; + } + /* Inhibit using the reduced gap if + The kerning is large - chars are not kerned and reducing "f"s can cause + erroneous blanks +OR The real gap is less than 0 +OR The real gap is less than the kerning estimate +*/ + if ((row->kern_size > tosp_large_kerning * row->xheight) || + ((tosp_dont_fool_with_small_kerns >= 0) && + (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) { + // Ignore the difference + within_xht_current_gap = real_current_gap; + } + + if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) { + current_gap = within_xht_current_gap; + } else { + current_gap = real_current_gap; + } + + if (tosp_old_to_method) { + // Boring old method + space = current_gap > row->max_nonspace; + if (space && (current_gap < INT16_MAX)) { + if (current_gap < row->min_space) { + if (current_gap > row->space_threshold) { + blanks = 1; + fuzzy_sp = true; + fuzzy_non = false; + } else { + blanks = 0; + fuzzy_sp = false; + fuzzy_non = true; + } + } else { + if (row->space_size == 0.0f) { + // Avoid FP division by 0. + blanks = 1; + } else { + blanks = static_cast<uint8_t>(current_gap / row->space_size); + if (blanks < 1) { + blanks = 1; + } + } + fuzzy_sp = false; + fuzzy_non = false; + } + } + return space; + } else { + /* New exciting heuristic method */ + if (prev_blob_box.null_box()) { // Beginning of row + prev_gap_was_a_space = true; + } + + // Default as old TO + space = current_gap > row->space_threshold; + + /* Set defaults for the word break in case we find one. Currently there are +no fuzzy spaces. Depending on the reliability of the different heuristics +we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY +be used if the function returns true - ie the word is to be broken. +*/ + int num_blanks = current_gap; + if (row->space_size > 1.0f) { + num_blanks = IntCastRounded(current_gap / row->space_size); + } + blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX)); + fuzzy_sp = false; + fuzzy_non = false; + /* +If xht measure causes gap to flip one of the 3 thresholds act accordingly - +despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to +context. +*/ + if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) && + (within_xht_current_gap > row->max_nonspace)) { + space = true; + fuzzy_non = true; +#ifndef GRAPHICS_DISABLED + mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), + next_gap); +#endif + } else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) && + (within_xht_current_gap > row->space_threshold)) { + space = true; + if (tosp_flip_fuzz_kn_to_sp) { + fuzzy_sp = true; + } else { + fuzzy_non = true; + } +#ifndef GRAPHICS_DISABLED + mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), + next_gap); +#endif + } else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) && + (within_xht_current_gap >= row->min_space)) { + space = true; +#ifndef GRAPHICS_DISABLED + mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), + next_gap); +#endif + } else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) && + suspected_punct_blob(row, blob_box)) { + break_at_next_gap = true; + } + /* Now continue with normal heuristics */ + else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) { + /* Heuristics to turn dubious spaces to kerns */ + if (tosp_pass_wide_fuzz_sp_to_context > 0) { + fuzzy_sp_to_kn_limit = + row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size); + } else { + fuzzy_sp_to_kn_limit = 99999.0f; + } + + /* If current gap is significantly smaller than the previous space the +other side of a narrow blob then this gap is a kern. */ + if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space && + (current_gap <= tosp_gap_factor * prev_gap)) { + if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { + if (tosp_flip_fuzz_sp_to_kn) { + fuzzy_non = true; + } else { + fuzzy_sp = true; + } + } else { + space = false; + } +#ifndef GRAPHICS_DISABLED + mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), + next_gap); +#endif + } + /* If current gap not much bigger than the previous kern the other side of +a narrow blob then this gap is a kern as well */ + else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && + !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) { + if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { + if (tosp_flip_fuzz_sp_to_kn) { + fuzzy_non = true; + } else { + fuzzy_sp = true; + } + } else { + space = false; + } +#ifndef GRAPHICS_DISABLED + mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), + next_gap); +#endif + } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) && + (next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) { + if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { + if (tosp_flip_fuzz_sp_to_kn) { + fuzzy_non = true; + } else { + fuzzy_sp = true; + } + } else { + space = false; + } +#ifndef GRAPHICS_DISABLED + mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), + next_gap); +#endif + } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) && + (next_gap <= row->space_threshold) && + (current_gap * tosp_gap_factor <= next_gap)) { + if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { + if (tosp_flip_fuzz_sp_to_kn) { + fuzzy_non = true; + } else { + fuzzy_sp = true; + } + } else { + space = false; + } +#ifndef GRAPHICS_DISABLED + mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), + next_gap); +#endif + } else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) || + ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) { + fuzzy_sp = true; +#ifndef GRAPHICS_DISABLED + mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), + next_gap); +#endif + } + } else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) { + /* Heuristics to turn dubious kerns to spaces */ + /* TRIED THIS BUT IT MADE THINGS WORSE + if (prev_gap == INT16_MAX) + prev_gap = 0; // start of row + if (next_gap == INT16_MAX) + next_gap = 0; // end of row +*/ + if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) && + (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) && + wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) { + space = true; + /* +tosp_flip_caution is an attempt to stop the default changing in cases +where there is a large difference between the kern and space estimates. + See problem in 'chiefs' where "have" gets split in the quotation. +*/ + if ((tosp_flip_fuzz_kn_to_sp) && + ((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) { + fuzzy_sp = true; + } else { + fuzzy_non = true; + } +#ifndef GRAPHICS_DISABLED + mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), + next_gap); +#endif + } else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 && + current_gap > 5 && // Rule 9 handles small gap, big ratio. + current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) && + !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) && + !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) { + space = true; + fuzzy_non = true; +#ifndef GRAPHICS_DISABLED + mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), + next_gap); +#endif + } else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) && + (next_blob_box.width() > 0) && + (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) && + (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) && + !suspected_punct_blob(row, next_blob_box)))) { + space = true; + fuzzy_non = true; +#ifndef GRAPHICS_DISABLED + mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), + next_gap); +#endif + } + } + if (tosp_debug_level > 10) { + tprintf( + "word break = %d current_gap = %d, prev_gap = %d, " + "next_gap = %d\n", + space ? 1 : 0, current_gap, prev_gap, next_gap); + } + prev_gap_was_a_space = space && !(fuzzy_non); + return space; + } +} + +bool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) { + bool result; + result = + ((blob_box.width() <= tosp_narrow_fraction * row->xheight) || + ((static_cast<float>(blob_box.width()) / blob_box.height()) <= tosp_narrow_aspect_ratio)); + return result; +} + +bool Textord::wide_blob(TO_ROW *row, TBOX blob_box) { + bool result; + if (tosp_wide_fraction > 0) { + if (tosp_wide_aspect_ratio > 0) { + result = + ((blob_box.width() >= tosp_wide_fraction * row->xheight) && + ((static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio)); + } else { + result = (blob_box.width() >= tosp_wide_fraction * row->xheight); + } + } else { + result = !narrow_blob(row, blob_box); + } + return result; +} + +bool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) { + bool result; + float baseline; + float blob_x_centre; + /* Find baseline of centre of blob */ + blob_x_centre = (box.right() + box.left()) / 2.0; + baseline = row->baseline.y(blob_x_centre); + + result = (box.height() <= 0.66 * row->xheight) || (box.top() < baseline + row->xheight / 2.0) || + (box.bottom() > baseline + row->xheight / 2.0); + return result; +} + +void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box, + int16_t &next_gap, int16_t &next_within_xht_gap) { + TBOX next_reduced_blob_box; + TBOX bit_beyond; + BLOBNBOX_IT reduced_box_it = box_it; + + next_blob_box = box_next(&box_it); + next_reduced_blob_box = reduced_box_next(row, &reduced_box_it); + if (box_it.at_first()) { + next_gap = INT16_MAX; + next_within_xht_gap = INT16_MAX; + } else { + bit_beyond = box_it.data()->bounding_box(); + next_gap = bit_beyond.left() - next_blob_box.right(); + bit_beyond = reduced_box_next(row, &reduced_box_it); + next_within_xht_gap = bit_beyond.left() - next_reduced_blob_box.right(); + } +} + +#ifndef GRAPHICS_DISABLED +void Textord::mark_gap(TBOX blob, // blob following gap + int16_t rule, // heuristic id + int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap, + int16_t next_blob_width, int16_t next_gap) { + ScrollView::Color col; // of ellipse marking flipped gap + + switch (rule) { + case 1: + col = ScrollView::RED; + break; + case 2: + col = ScrollView::CYAN; + break; + case 3: + col = ScrollView::GREEN; + break; + case 4: + col = ScrollView::BLACK; + break; + case 5: + col = ScrollView::MAGENTA; + break; + case 6: + col = ScrollView::BLUE; + break; + + case 7: + col = ScrollView::WHITE; + break; + case 8: + col = ScrollView::YELLOW; + break; + case 9: + col = ScrollView::BLACK; + break; + + case 20: + col = ScrollView::CYAN; + break; + case 21: + col = ScrollView::GREEN; + break; + case 22: + col = ScrollView::MAGENTA; + break; + default: + col = ScrollView::BLACK; + } + if (textord_show_initial_words) { + to_win->Pen(col); + /* if (rule < 20) + //interior_style(to_win, INT_SOLID, false); + else + //interior_style(to_win, INT_HOLLOW, true);*/ + // x radius + to_win->Ellipse(current_gap / 2.0f, + blob.height() / 2.0f, // y radius + // x centre + blob.left() - current_gap / 2.0f, + // y centre + blob.bottom() + blob.height() / 2.0f); + } + if (tosp_debug_level > 5) { + tprintf(" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2, + blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap); + } +} +#endif + +float Textord::find_mean_blob_spacing(WERD *word) { + C_BLOB_IT cblob_it; + TBOX blob_box; + int32_t gap_sum = 0; + int16_t gap_count = 0; + int16_t prev_right; + + cblob_it.set_to_list(word->cblob_list()); + if (!cblob_it.empty()) { + cblob_it.mark_cycle_pt(); + prev_right = cblob_it.data()->bounding_box().right(); + // first blob + cblob_it.forward(); + for (; !cblob_it.cycled_list(); cblob_it.forward()) { + blob_box = cblob_it.data()->bounding_box(); + gap_sum += blob_box.left() - prev_right; + gap_count++; + prev_right = blob_box.right(); + } + } + if (gap_count > 0) { + return (gap_sum / static_cast<float>(gap_count)); + } else { + return 0.0f; + } +} + +bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left, + int16_t right) { + int16_t gap = right - left + 1; + + if (tosp_ignore_big_gaps > 999) { + return false; // Don't ignore + } + if (tosp_ignore_big_gaps > 0) { + return (gap > tosp_ignore_big_gaps * row->xheight); + } + if (gap > tosp_ignore_very_big_gaps * row->xheight) { + return true; + } + if (tosp_ignore_big_gaps == 0) { + if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) { + return true; + } + if ((gap > 1.75 * row->xheight) && + ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right))) { + return true; + } + } else { + /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table + */ + if ((gap > gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right)) { + return true; + } + } + return false; +} + +/********************************************************************** + * reduced_box_next + * + * Compute the bounding box of this blob with merging of x overlaps + * but no pre-chopping. + * Then move the iterator on to the start of the next blob. + * DON'T reduce the box for small things - eg punctuation. + **********************************************************************/ +TBOX Textord::reduced_box_next(TO_ROW *row, // current row + BLOBNBOX_IT *it // iterator to blobds +) { + BLOBNBOX *blob; // current blob + BLOBNBOX *head_blob; // place to store box + TBOX full_box; // full blob boundg box + TBOX reduced_box; // box of significant part + int16_t left_above_xht; // ABOVE xht left limit + int16_t new_left_above_xht; // ABOVE xht left limit + + blob = it->data(); + if (blob->red_box_set()) { + reduced_box = blob->reduced_box(); + do { + it->forward(); + blob = it->data(); + } while (blob->cblob() == nullptr || blob->joined_to_prev()); + return reduced_box; + } + head_blob = blob; + full_box = blob->bounding_box(); + reduced_box = reduced_box_for_blob(blob, row, &left_above_xht); + do { + it->forward(); + blob = it->data(); + if (blob->cblob() == nullptr) { + // was pre-chopped + full_box += blob->bounding_box(); + } else if (blob->joined_to_prev()) { + reduced_box += reduced_box_for_blob(blob, row, &new_left_above_xht); + left_above_xht = std::min(left_above_xht, new_left_above_xht); + } + } + // until next real blob + while (blob->cblob() == nullptr || blob->joined_to_prev()); + + if ((reduced_box.width() > 0) && + ((reduced_box.left() + tosp_near_lh_edge * reduced_box.width()) < left_above_xht) && + (reduced_box.height() > 0.7 * row->xheight)) { +#ifndef GRAPHICS_DISABLED + if (textord_show_initial_words) { + reduced_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW); + } +#endif + } else { + reduced_box = full_box; + } + head_blob->set_reduced_box(reduced_box); + return reduced_box; +} + +/************************************************************************* + * reduced_box_for_blob() + * Find box for blob which is the same height and y position as the whole blob, + * but whose left limit is the left most position of the blob ABOVE the + * baseline and whose right limit is the right most position of the blob BELOW + * the xheight. + * + * + * !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on + * "home". Perhaps we need something which say if the width ABOVE the + * xht alone includes the whole of the reduced width, then use the full + * blob box - Might still fail on italic F + * + * Alternatively we could be a little less severe and only reduce the + * left and right edges by half the difference between the full box and + * the reduced box. + * + * NOTE that we need to rotate all the coordinates as + * find_blob_limits finds the y min and max within a specified x band + *************************************************************************/ +TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) { + float baseline; + float blob_x_centre; + float left_limit; + float right_limit; + float junk; + TBOX blob_box; + + /* Find baseline of centre of blob */ + + blob_box = blob->bounding_box(); + blob_x_centre = (blob_box.left() + blob_box.right()) / 2.0; + baseline = row->baseline.y(blob_x_centre); + + /* +Find LH limit of blob ABOVE the xht. This is so that we can detect certain +caps ht chars which should NOT have their box reduced: T, Y, V, W etc +*/ + left_limit = static_cast<float>(INT32_MAX); + junk = static_cast<float>(-INT32_MAX); + find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), static_cast<float>(INT16_MAX), + left_limit, junk); + if (left_limit > junk) { + *left_above_xht = INT16_MAX; // No area above xht + } else { + *left_above_xht = static_cast<int16_t>(std::floor(left_limit)); + } + /* +Find reduced LH limit of blob - the left extent of the region ABOVE the +baseline. +*/ + left_limit = static_cast<float>(INT32_MAX); + junk = static_cast<float>(-INT32_MAX); + find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX), left_limit, junk); + + if (left_limit > junk) { + return TBOX(); // no area within xht so return empty box + } + /* +Find reduced RH limit of blob - the right extent of the region BELOW the xht. +*/ + junk = static_cast<float>(INT32_MAX); + right_limit = static_cast<float>(-INT32_MAX); + find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX), (baseline + row->xheight), junk, + right_limit); + if (junk > right_limit) { + return TBOX(); // no area within xht so return empty box + } + + return TBOX(ICOORD(static_cast<int16_t>(std::floor(left_limit)), blob_box.bottom()), + ICOORD(static_cast<int16_t>(std::ceil(right_limit)), blob_box.top())); +} +} // namespace tesseract
