Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/topitch.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/topitch.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,1742 @@ +/********************************************************************** + * File: topitch.cpp (Formerly to_pitch.c) + * Description: Code to determine fixed pitchness and the pitch if fixed. + * Author: Ray Smith + * + * (C) Copyright 1993, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "topitch.h" + +#include "blobbox.h" +#include "drawtord.h" +#include "makerow.h" +#include "pithsync.h" +#include "pitsync1.h" +#include "statistc.h" +#include "tovars.h" +#include "wordseg.h" + +#include "helpers.h" + +#include <memory> + +namespace tesseract { + +static BOOL_VAR(textord_all_prop, false, "All doc is proportial text"); +BOOL_VAR(textord_debug_pitch_test, false, "Debug on fixed pitch test"); +static BOOL_VAR(textord_disable_pitch_test, false, "Turn off dp fixed pitch algorithm"); +BOOL_VAR(textord_fast_pitch_test, false, "Do even faster pitch algorithm"); +BOOL_VAR(textord_debug_pitch_metric, false, "Write full metric stuff"); +BOOL_VAR(textord_show_row_cuts, false, "Draw row-level cuts"); +BOOL_VAR(textord_show_page_cuts, false, "Draw page-level cuts"); +BOOL_VAR(textord_blockndoc_fixed, false, "Attempt whole doc/block fixed pitch"); +double_VAR(textord_projection_scale, 0.200, "Ding rate for mid-cuts"); +double_VAR(textord_balance_factor, 1.0, "Ding rate for unbalanced char cells"); + +#define BLOCK_STATS_CLUSTERS 10 +#define MAX_ALLOWED_PITCH 100 // max pixel pitch. + +// qsort function to sort 2 floats. +static int sort_floats(const void *arg1, const void *arg2) { + float diff = *reinterpret_cast<const float *>(arg1) - *reinterpret_cast<const float *>(arg2); + if (diff > 0) { + return 1; + } else if (diff < 0) { + return -1; + } else { + return 0; + } +} + +/********************************************************************** + * compute_fixed_pitch + * + * Decide whether each row is fixed pitch individually. + * Correlate definite and uncertain results to obtain an individual + * result for each row in the TO_ROW class. + **********************************************************************/ + +void compute_fixed_pitch(ICOORD page_tr, // top right + TO_BLOCK_LIST *port_blocks, // input list + float gradient, // page skew + FCOORD rotation, // for drawing + bool testing_on) { // correct orientation + TO_BLOCK_IT block_it; // iterator + TO_BLOCK *block; // current block; + TO_ROW *row; // current row + int block_index; // block number + int row_index; // row number + +#ifndef GRAPHICS_DISABLED + if (textord_show_initial_words && testing_on) { + if (to_win == nullptr) { + create_to_win(page_tr); + } + } +#endif + + block_it.set_to_list(port_blocks); + block_index = 1; + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + block = block_it.data(); + compute_block_pitch(block, rotation, block_index, testing_on); + block_index++; + } + + if (!try_doc_fixed(page_tr, port_blocks, gradient)) { + block_index = 1; + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + block = block_it.data(); + if (!try_block_fixed(block, block_index)) { + try_rows_fixed(block, block_index, testing_on); + } + block_index++; + } + } + + block_index = 1; + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + block = block_it.data(); + POLY_BLOCK *pb = block->block->pdblk.poly_block(); + if (pb != nullptr && !pb->IsText()) { + continue; // Non-text doesn't exist! + } + // row iterator + TO_ROW_IT row_it(block->get_rows()); + row_index = 1; + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + fix_row_pitch(row, block, port_blocks, row_index, block_index); + row_index++; + } + block_index++; + } +#ifndef GRAPHICS_DISABLED + if (textord_show_initial_words && testing_on) { + ScrollView::Update(); + } +#endif +} + +/********************************************************************** + * fix_row_pitch + * + * Get a pitch_decision for this row by voting among similar rows in the + * block, then similar rows over all the page, or any other rows at all. + **********************************************************************/ + +void fix_row_pitch(TO_ROW *bad_row, // row to fix + TO_BLOCK *bad_block, // block of bad_row + TO_BLOCK_LIST *blocks, // blocks to scan + int32_t row_target, // number of row + int32_t block_target) { // number of block + int16_t mid_cuts; + int block_votes; // votes in block + int like_votes; // votes over page + int other_votes; // votes of unlike blocks + int block_index; // number of block + int maxwidth; // max pitch + TO_BLOCK_IT block_it = blocks; // block iterator + TO_BLOCK *block; // current block + TO_ROW *row; // current row + float sp_sd; // space deviation + STATS block_stats; // pitches in block + STATS like_stats; // pitches in page + + block_votes = like_votes = other_votes = 0; + maxwidth = static_cast<int32_t>(ceil(bad_row->xheight * textord_words_maxspace)); + if (bad_row->pitch_decision != PITCH_DEF_FIXED && bad_row->pitch_decision != PITCH_DEF_PROP) { + block_stats.set_range(0, maxwidth - 1); + like_stats.set_range(0, maxwidth - 1); + block_index = 1; + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + block = block_it.data(); + POLY_BLOCK *pb = block->block->pdblk.poly_block(); + if (pb != nullptr && !pb->IsText()) { + continue; // Non text doesn't exist! + } + TO_ROW_IT row_it(block->get_rows()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + if ((bad_row->all_caps && + row->xheight + row->ascrise < + (bad_row->xheight + bad_row->ascrise) * (1 + textord_pitch_rowsimilarity) && + row->xheight + row->ascrise > + (bad_row->xheight + bad_row->ascrise) * (1 - textord_pitch_rowsimilarity)) || + (!bad_row->all_caps && + row->xheight < bad_row->xheight * (1 + textord_pitch_rowsimilarity) && + row->xheight > bad_row->xheight * (1 - textord_pitch_rowsimilarity))) { + if (block_index == block_target) { + if (row->pitch_decision == PITCH_DEF_FIXED) { + block_votes += textord_words_veto_power; + block_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power); + } else if (row->pitch_decision == PITCH_MAYBE_FIXED || + row->pitch_decision == PITCH_CORR_FIXED) { + block_votes++; + block_stats.add(static_cast<int32_t>(row->fixed_pitch), 1); + } else if (row->pitch_decision == PITCH_DEF_PROP) { + block_votes -= textord_words_veto_power; + } else if (row->pitch_decision == PITCH_MAYBE_PROP || + row->pitch_decision == PITCH_CORR_PROP) { + block_votes--; + } + } else { + if (row->pitch_decision == PITCH_DEF_FIXED) { + like_votes += textord_words_veto_power; + like_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power); + } else if (row->pitch_decision == PITCH_MAYBE_FIXED || + row->pitch_decision == PITCH_CORR_FIXED) { + like_votes++; + like_stats.add(static_cast<int32_t>(row->fixed_pitch), 1); + } else if (row->pitch_decision == PITCH_DEF_PROP) { + like_votes -= textord_words_veto_power; + } else if (row->pitch_decision == PITCH_MAYBE_PROP || + row->pitch_decision == PITCH_CORR_PROP) { + like_votes--; + } + } + } else { + if (row->pitch_decision == PITCH_DEF_FIXED) { + other_votes += textord_words_veto_power; + } else if (row->pitch_decision == PITCH_MAYBE_FIXED || + row->pitch_decision == PITCH_CORR_FIXED) { + other_votes++; + } else if (row->pitch_decision == PITCH_DEF_PROP) { + other_votes -= textord_words_veto_power; + } else if (row->pitch_decision == PITCH_MAYBE_PROP || + row->pitch_decision == PITCH_CORR_PROP) { + other_votes--; + } + } + } + block_index++; + } + if (block_votes > textord_words_veto_power) { + bad_row->fixed_pitch = block_stats.ile(0.5); + bad_row->pitch_decision = PITCH_CORR_FIXED; + } else if (block_votes <= textord_words_veto_power && like_votes > 0) { + bad_row->fixed_pitch = like_stats.ile(0.5); + bad_row->pitch_decision = PITCH_CORR_FIXED; + } else { + bad_row->pitch_decision = PITCH_CORR_PROP; + if (block_votes == 0 && like_votes == 0 && other_votes > 0 && + (textord_debug_pitch_test || textord_debug_pitch_metric)) { + tprintf( + "Warning:row %d of block %d set prop with no like rows against " + "trend\n", + row_target, block_target); + } + } + } + if (textord_debug_pitch_metric) { + tprintf(":b_votes=%d:l_votes=%d:o_votes=%d", block_votes, like_votes, other_votes); + tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise); + } + if (bad_row->pitch_decision == PITCH_CORR_FIXED) { + if (bad_row->fixed_pitch < textord_min_xheight) { + if (block_votes > 0) { + bad_row->fixed_pitch = block_stats.ile(0.5); + } else if (block_votes == 0 && like_votes > 0) { + bad_row->fixed_pitch = like_stats.ile(0.5); + } else { + tprintf("Warning:guessing pitch as xheight on row %d, block %d\n", row_target, + block_target); + bad_row->fixed_pitch = bad_row->xheight; + } + } + if (bad_row->fixed_pitch < textord_min_xheight) { + bad_row->fixed_pitch = (float)textord_min_xheight; + } + bad_row->kern_size = bad_row->fixed_pitch / 4; + bad_row->min_space = static_cast<int32_t>(bad_row->fixed_pitch * 0.6); + bad_row->max_nonspace = static_cast<int32_t>(bad_row->fixed_pitch * 0.4); + bad_row->space_threshold = (bad_row->min_space + bad_row->max_nonspace) / 2; + bad_row->space_size = bad_row->fixed_pitch; + if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) { + tune_row_pitch(bad_row, &bad_row->projection, bad_row->projection_left, + bad_row->projection_right, + (bad_row->fixed_pitch + bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch, + sp_sd, mid_cuts, &bad_row->char_cells, false); + } + } else if (bad_row->pitch_decision == PITCH_CORR_PROP || + bad_row->pitch_decision == PITCH_DEF_PROP) { + bad_row->fixed_pitch = 0.0f; + bad_row->char_cells.clear(); + } +} + +/********************************************************************** + * compute_block_pitch + * + * Decide whether each block is fixed pitch individually. + **********************************************************************/ + +void compute_block_pitch(TO_BLOCK *block, // input list + FCOORD rotation, // for drawing + int32_t block_index, // block number + bool testing_on) { // correct orientation + TBOX block_box; // bounding box + + block_box = block->block->pdblk.bounding_box(); + if (testing_on && textord_debug_pitch_test) { + tprintf("Block %d at (%d,%d)->(%d,%d)\n", block_index, block_box.left(), block_box.bottom(), + block_box.right(), block_box.top()); + } + block->min_space = static_cast<int32_t>(floor(block->xheight * textord_words_default_minspace)); + block->max_nonspace = static_cast<int32_t>(ceil(block->xheight * textord_words_default_nonspace)); + block->fixed_pitch = 0.0f; + block->space_size = static_cast<float>(block->min_space); + block->kern_size = static_cast<float>(block->max_nonspace); + block->pr_nonsp = block->xheight * words_default_prop_nonspace; + block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop; + if (!block->get_rows()->empty()) { + ASSERT_HOST(block->xheight > 0); + find_repeated_chars(block, textord_show_initial_words && testing_on); +#ifndef GRAPHICS_DISABLED + if (textord_show_initial_words && testing_on) { + // overlap_picture_ops(true); + ScrollView::Update(); + } +#endif + compute_rows_pitch(block, block_index, textord_debug_pitch_test && testing_on); + } +} + +/********************************************************************** + * compute_rows_pitch + * + * Decide whether each row is fixed pitch individually. + **********************************************************************/ + +bool compute_rows_pitch( // find line stats + TO_BLOCK *block, // block to do + int32_t block_index, // block number + bool testing_on // correct orientation +) { + int32_t maxwidth; // of spaces + TO_ROW *row; // current row + int32_t row_index; // row number. + float lower, upper; // cluster thresholds + TO_ROW_IT row_it = block->get_rows(); + + row_index = 1; + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + ASSERT_HOST(row->xheight > 0); + row->compute_vertical_projection(); + maxwidth = static_cast<int32_t>(ceil(row->xheight * textord_words_maxspace)); + if (row_pitch_stats(row, maxwidth, testing_on) && + find_row_pitch(row, maxwidth, textord_dotmatrix_gap + 1, block, block_index, row_index, + testing_on)) { + if (row->fixed_pitch == 0) { + lower = row->pr_nonsp; + upper = row->pr_space; + row->space_size = upper; + row->kern_size = lower; + } + } else { + row->fixed_pitch = 0.0f; // insufficient data + row->pitch_decision = PITCH_DUNNO; + } + row_index++; + } + return false; +} + +/********************************************************************** + * try_doc_fixed + * + * Attempt to call the entire document fixed pitch. + **********************************************************************/ + +bool try_doc_fixed( // determine pitch + ICOORD page_tr, // top right + TO_BLOCK_LIST *port_blocks, // input list + float gradient // page skew +) { + int16_t master_x; // uniform shifts + int16_t pitch; // median pitch. + int x; // profile coord + int prop_blocks; // correct counts + int fixed_blocks; + int total_row_count; // total in page + // iterator + TO_BLOCK_IT block_it = port_blocks; + TO_BLOCK *block; // current block; + TO_ROW *row; // current row + int16_t projection_left; // edges + int16_t projection_right; + int16_t row_left; // edges of row + int16_t row_right; + float master_y; // uniform shifts + float shift_factor; // page skew correction + float final_pitch; // output pitch + float row_y; // baseline + STATS projection; // entire page + STATS pitches(0, MAX_ALLOWED_PITCH - 1); + // for median + float sp_sd; // space sd + int16_t mid_cuts; // no of cheap cuts + float pitch_sd; // sync rating + + if (!textord_blockndoc_fixed || + block_it.empty() || block_it.data()->get_rows()->empty()) { + return false; + } + shift_factor = gradient / (gradient * gradient + 1); + // row iterator + TO_ROW_IT row_it(block_it.data()->get_rows()); + master_x = row_it.data()->projection_left; + master_y = row_it.data()->baseline.y(master_x); + projection_left = INT16_MAX; + projection_right = -INT16_MAX; + prop_blocks = 0; + fixed_blocks = 0; + total_row_count = 0; + + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + block = block_it.data(); + row_it.set_to_list(block->get_rows()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + total_row_count++; + if (row->fixed_pitch > 0) { + pitches.add(static_cast<int32_t>(row->fixed_pitch), 1); + } + // find median + row_y = row->baseline.y(master_x); + row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y)); + row_right = static_cast<int16_t>(row->projection_right - shift_factor * (master_y - row_y)); + if (row_left < projection_left) { + projection_left = row_left; + } + if (row_right > projection_right) { + projection_right = row_right; + } + } + } + if (pitches.get_total() == 0) { + return false; + } + projection.set_range(projection_left, projection_right - 1); + + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + block = block_it.data(); + row_it.set_to_list(block->get_rows()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + row_y = row->baseline.y(master_x); + row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y)); + for (x = row->projection_left; x < row->projection_right; x++, row_left++) { + projection.add(row_left, row->projection.pile_count(x)); + } + } + } + + row_it.set_to_list(block_it.data()->get_rows()); + row = row_it.data(); +#ifndef GRAPHICS_DISABLED + if (textord_show_page_cuts && to_win != nullptr) { + projection.plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL); + } +#endif + final_pitch = pitches.ile(0.5); + pitch = static_cast<int16_t>(final_pitch); + pitch_sd = tune_row_pitch(row, &projection, projection_left, projection_right, pitch * 0.75, + final_pitch, sp_sd, mid_cuts, &row->char_cells, false); + + if (textord_debug_pitch_metric) { + tprintf( + "try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%" + "g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n", + prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd, pitch_sd / total_row_count, + pitch_sd / pitch, pitch_sd / total_row_count / pitch); + } + +#ifndef GRAPHICS_DISABLED + if (textord_show_page_cuts && to_win != nullptr) { + float row_shift; // shift for row + ICOORDELT_LIST *master_cells; // cells for page + master_cells = &row->char_cells; + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + block = block_it.data(); + row_it.set_to_list(block->get_rows()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + row_y = row->baseline.y(master_x); + row_shift = shift_factor * (master_y - row_y); + plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells); + } + } + } +#endif + row->char_cells.clear(); + return false; +} + +/********************************************************************** + * try_block_fixed + * + * Try to call the entire block fixed. + **********************************************************************/ + +bool try_block_fixed( // find line stats + TO_BLOCK *block, // block to do + int32_t block_index // block number +) { + return false; +} + +/********************************************************************** + * try_rows_fixed + * + * Decide whether each row is fixed pitch individually. + **********************************************************************/ + +bool try_rows_fixed( // find line stats + TO_BLOCK *block, // block to do + int32_t block_index, // block number + bool testing_on // correct orientation +) { + TO_ROW *row; // current row + int32_t def_fixed = 0; // counters + int32_t def_prop = 0; + int32_t maybe_fixed = 0; + int32_t maybe_prop = 0; + int32_t dunno = 0; + int32_t corr_fixed = 0; + int32_t corr_prop = 0; + float lower, upper; // cluster thresholds + TO_ROW_IT row_it = block->get_rows(); + + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + ASSERT_HOST(row->xheight > 0); + if (row->fixed_pitch > 0 && fixed_pitch_row(row, block->block, block_index)) { + if (row->fixed_pitch == 0) { + lower = row->pr_nonsp; + upper = row->pr_space; + row->space_size = upper; + row->kern_size = lower; + } + } + } + count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop, + dunno); + if (testing_on && + (textord_debug_pitch_test || textord_blocksall_prop || textord_blocksall_fixed)) { + tprintf("Initially:"); + print_block_counts(block, block_index); + } + if (def_fixed > def_prop * textord_words_veto_power) { + block->pitch_decision = PITCH_DEF_FIXED; + } else if (def_prop > def_fixed * textord_words_veto_power) { + block->pitch_decision = PITCH_DEF_PROP; + } else if (def_fixed > 0 || def_prop > 0) { + block->pitch_decision = PITCH_DUNNO; + } else if (maybe_fixed > maybe_prop * textord_words_veto_power) { + block->pitch_decision = PITCH_MAYBE_FIXED; + } else if (maybe_prop > maybe_fixed * textord_words_veto_power) { + block->pitch_decision = PITCH_MAYBE_PROP; + } else { + block->pitch_decision = PITCH_DUNNO; + } + return false; +} + +/********************************************************************** + * print_block_counts + * + * Count up how many rows have what decision and print the results. + **********************************************************************/ + +void print_block_counts( // find line stats + TO_BLOCK *block, // block to do + int32_t block_index // block number +) { + int32_t def_fixed = 0; // counters + int32_t def_prop = 0; + int32_t maybe_fixed = 0; + int32_t maybe_prop = 0; + int32_t dunno = 0; + int32_t corr_fixed = 0; + int32_t corr_prop = 0; + + count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop, + dunno); + tprintf("Block %d has (%d,%d,%d)", block_index, def_fixed, maybe_fixed, corr_fixed); + if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed)) { + tprintf(" (Wrongly)"); + } + tprintf(" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop); + if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop)) { + tprintf(" (Wrongly)"); + } + tprintf(" prop, %d dunno\n", dunno); +} + +/********************************************************************** + * count_block_votes + * + * Count the number of rows in the block with each kind of pitch_decision. + **********************************************************************/ + +void count_block_votes( // find line stats + TO_BLOCK *block, // block to do + int32_t &def_fixed, // add to counts + int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed, + int32_t &corr_prop, int32_t &dunno) { + TO_ROW *row; // current row + TO_ROW_IT row_it = block->get_rows(); + + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + switch (row->pitch_decision) { + case PITCH_DUNNO: + dunno++; + break; + case PITCH_DEF_PROP: + def_prop++; + break; + case PITCH_MAYBE_PROP: + maybe_prop++; + break; + case PITCH_DEF_FIXED: + def_fixed++; + break; + case PITCH_MAYBE_FIXED: + maybe_fixed++; + break; + case PITCH_CORR_PROP: + corr_prop++; + break; + case PITCH_CORR_FIXED: + corr_fixed++; + break; + } + } +} + +/********************************************************************** + * row_pitch_stats + * + * Decide whether each row is fixed pitch individually. + **********************************************************************/ + +bool row_pitch_stats( // find line stats + TO_ROW *row, // current row + int32_t maxwidth, // of spaces + bool testing_on // correct orientation +) { + BLOBNBOX *blob; // current blob + int gap_index; // current gap + int32_t prev_x; // end of prev blob + int32_t cluster_count; // no of clusters + int32_t prev_count; // of clusters + int32_t smooth_factor; // for smoothing stats + TBOX blob_box; // bounding box + float lower, upper; // cluster thresholds + // gap sizes + float gaps[BLOCK_STATS_CLUSTERS]; + // blobs + BLOBNBOX_IT blob_it = row->blob_list(); + STATS gap_stats(0, maxwidth - 1); + STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; + // clusters + + smooth_factor = static_cast<int32_t>(row->xheight * textord_wordstats_smooth_factor + 1.5); + if (!blob_it.empty()) { + prev_x = blob_it.data()->bounding_box().right(); + blob_it.forward(); + while (!blob_it.at_first()) { + blob = blob_it.data(); + if (!blob->joined_to_prev()) { + blob_box = blob->bounding_box(); + if (blob_box.left() - prev_x < maxwidth) { + gap_stats.add(blob_box.left() - prev_x, 1); + } + prev_x = blob_box.right(); + } + blob_it.forward(); + } + } + if (gap_stats.get_total() == 0) { + return false; + } + cluster_count = 0; + lower = row->xheight * words_initial_lower; + upper = row->xheight * words_initial_upper; + gap_stats.smooth(smooth_factor); + do { + prev_count = cluster_count; + cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, + BLOCK_STATS_CLUSTERS, cluster_stats); + } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); + if (cluster_count < 1) { + return false; + } + for (gap_index = 0; gap_index < cluster_count; gap_index++) { + gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5); + } + // get medians + if (testing_on) { + tprintf("cluster_count=%d:", cluster_count); + for (gap_index = 0; gap_index < cluster_count; gap_index++) { + tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total()); + } + tprintf("\n"); + } + qsort(gaps, cluster_count, sizeof(float), sort_floats); + + // Try to find proportional non-space and space for row. + lower = row->xheight * words_default_prop_nonspace; + upper = row->xheight * textord_words_min_minspace; + for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < lower; gap_index++) { + ; + } + if (gap_index == 0) { + if (testing_on) { + tprintf("No clusters below nonspace threshold!!\n"); + } + if (cluster_count > 1) { + row->pr_nonsp = gaps[0]; + row->pr_space = gaps[1]; + } else { + row->pr_nonsp = lower; + row->pr_space = gaps[0]; + } + } else { + row->pr_nonsp = gaps[gap_index - 1]; + while (gap_index < cluster_count && gaps[gap_index] < upper) { + gap_index++; + } + if (gap_index == cluster_count) { + if (testing_on) { + tprintf("No clusters above nonspace threshold!!\n"); + } + row->pr_space = lower * textord_spacesize_ratioprop; + } else { + row->pr_space = gaps[gap_index]; + } + } + + // Now try to find the fixed pitch space and non-space. + upper = row->xheight * words_default_fixed_space; + for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < upper; gap_index++) { + ; + } + if (gap_index == 0) { + if (testing_on) { + tprintf("No clusters below space threshold!!\n"); + } + row->fp_nonsp = upper; + row->fp_space = gaps[0]; + } else { + row->fp_nonsp = gaps[gap_index - 1]; + if (gap_index == cluster_count) { + if (testing_on) { + tprintf("No clusters above space threshold!!\n"); + } + row->fp_space = row->xheight; + } else { + row->fp_space = gaps[gap_index]; + } + } + if (testing_on) { + tprintf( + "Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, " + "fp_space=%g\n", + row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space); + } + return true; // computed some stats +} + +/********************************************************************** + * find_row_pitch + * + * Check to see if this row could be fixed pitch using the given spacings. + * Blobs with gaps smaller than the lower threshold are assumed to be one. + * The larger threshold is the word gap threshold. + **********************************************************************/ + +bool find_row_pitch( // find lines + TO_ROW *row, // row to do + int32_t maxwidth, // max permitted space + int32_t dm_gap, // ignorable gaps + TO_BLOCK *block, // block of row + int32_t block_index, // block_number + int32_t row_index, // number of row + bool testing_on // correct orientation +) { + bool used_dm_model; // looks like dot matrix + float min_space; // estimate threshold + float non_space; // gap size + float gap_iqr; // interquartile range + float pitch_iqr; + float dm_gap_iqr; // interquartile range + float dm_pitch_iqr; + float dm_pitch; // pitch with dm on + float pitch; // revised estimate + float initial_pitch; // guess at pitch + STATS gap_stats(0, maxwidth - 1); + // centre-centre + STATS pitch_stats(0, maxwidth - 1); + + row->fixed_pitch = 0.0f; + initial_pitch = row->fp_space; + if (initial_pitch > row->xheight * (1 + words_default_fixed_limit)) { + initial_pitch = row->xheight; // keep pitch decent + } + non_space = row->fp_nonsp; + if (non_space > initial_pitch) { + non_space = initial_pitch; + } + min_space = (initial_pitch + non_space) / 2; + + if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false, + dm_gap)) { + dm_gap_iqr = 0.0001f; + dm_pitch_iqr = maxwidth * 2.0f; + dm_pitch = initial_pitch; + } else { + dm_gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25); + dm_pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25); + dm_pitch = pitch_stats.ile(0.5); + } + gap_stats.clear(); + pitch_stats.clear(); + if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false, 0)) { + gap_iqr = 0.0001f; + pitch_iqr = maxwidth * 3.0f; + } else { + gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25); + pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25); + if (testing_on) { + tprintf( + "First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, " + "pitch=%g\n", + initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5)); + } + initial_pitch = pitch_stats.ile(0.5); + if (min_space > initial_pitch && count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, + initial_pitch, true, false, 0)) { + min_space = initial_pitch; + gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25); + pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25); + if (testing_on) { + tprintf( + "Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, " + "pitch=%g\n", + initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5)); + } + initial_pitch = pitch_stats.ile(0.5); + } + } + if (textord_debug_pitch_metric) { + tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:", block_index, + row_index, 'X', pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr, + pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth + ? 'D' + : (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M')); + } + if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) { + row->pitch_decision = PITCH_DUNNO; + if (textord_debug_pitch_metric) { + tprintf("\n"); + } + return false; // insufficient data + } + if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) { + if (testing_on) { + tprintf( + "Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, " + "dm_gap_iqr=%g\n", + pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr); + } + gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25); + pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25); + pitch = pitch_stats.ile(0.5); + used_dm_model = false; + } else { + if (testing_on) { + tprintf( + "Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, " + "dm_gap_iqr=%g\n", + pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr); + } + gap_iqr = dm_gap_iqr; + pitch_iqr = dm_pitch_iqr; + pitch = dm_pitch; + used_dm_model = true; + } + if (textord_debug_pitch_metric) { + tprintf("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:", pitch_iqr, gap_iqr, pitch); + tprintf("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:", pitch_iqr / gap_iqr, pitch_iqr / block->xheight, + pitch_iqr < gap_iqr * textord_fpiqr_ratio && + pitch_iqr < block->xheight * textord_max_pitch_iqr && + pitch < block->xheight * textord_words_default_maxspace + ? 'F' + : 'P'); + } + if (pitch_iqr < gap_iqr * textord_fpiqr_ratio && + pitch_iqr < block->xheight * textord_max_pitch_iqr && + pitch < block->xheight * textord_words_default_maxspace) { + row->pitch_decision = PITCH_MAYBE_FIXED; + } else { + row->pitch_decision = PITCH_MAYBE_PROP; + } + row->fixed_pitch = pitch; + row->kern_size = gap_stats.ile(0.5); + row->min_space = static_cast<int32_t>(row->fixed_pitch + non_space) / 2; + if (row->min_space > row->fixed_pitch) { + row->min_space = static_cast<int32_t>(row->fixed_pitch); + } + row->max_nonspace = row->min_space; + row->space_size = row->fixed_pitch; + row->space_threshold = (row->max_nonspace + row->min_space) / 2; + row->used_dm_model = used_dm_model; + return true; +} + +/********************************************************************** + * fixed_pitch_row + * + * Check to see if this row could be fixed pitch using the given spacings. + * Blobs with gaps smaller than the lower threshold are assumed to be one. + * The larger threshold is the word gap threshold. + **********************************************************************/ + +bool fixed_pitch_row(TO_ROW *row, // row to do + BLOCK *block, + int32_t block_index // block_number +) { + const char *res_string; // pitch result + int16_t mid_cuts; // no of cheap cuts + float non_space; // gap size + float pitch_sd; // error on pitch + float sp_sd = 0.0f; // space sd + + non_space = row->fp_nonsp; + if (non_space > row->fixed_pitch) { + non_space = row->fixed_pitch; + } + POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr; + if (textord_all_prop || (pb != nullptr && !pb->IsText())) { + // Set the decision to definitely proportional. + pitch_sd = textord_words_def_prop * row->fixed_pitch; + row->pitch_decision = PITCH_DEF_PROP; + } else { + pitch_sd = tune_row_pitch(row, &row->projection, row->projection_left, row->projection_right, + (row->fixed_pitch + non_space * 3) / 4, row->fixed_pitch, sp_sd, + mid_cuts, &row->char_cells, block_index == textord_debug_block); + if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch && + ((pitsync_linear_version & 3) < 3 || + ((pitsync_linear_version & 3) >= 3 && + (row->used_dm_model || sp_sd > 20 || (pitch_sd == 0 && sp_sd > 10))))) { + if (pitch_sd < textord_words_def_fixed * row->fixed_pitch && !row->all_caps && + ((pitsync_linear_version & 3) < 3 || sp_sd > 20)) { + row->pitch_decision = PITCH_DEF_FIXED; + } else { + row->pitch_decision = PITCH_MAYBE_FIXED; + } + } else if ((pitsync_linear_version & 3) < 3 || sp_sd > 20 || mid_cuts > 0 || + pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) { + if (pitch_sd < textord_words_def_prop * row->fixed_pitch) { + row->pitch_decision = PITCH_MAYBE_PROP; + } else { + row->pitch_decision = PITCH_DEF_PROP; + } + } else { + row->pitch_decision = PITCH_DUNNO; + } + } + + if (textord_debug_pitch_metric) { + res_string = "??"; + switch (row->pitch_decision) { + case PITCH_DEF_PROP: + res_string = "DP"; + break; + case PITCH_MAYBE_PROP: + res_string = "MP"; + break; + case PITCH_DEF_FIXED: + res_string = "DF"; + break; + case PITCH_MAYBE_FIXED: + res_string = "MF"; + break; + default: + res_string = "??"; + } + tprintf(":sd/p=%g:occ=%g:init_res=%s\n", pitch_sd / row->fixed_pitch, sp_sd, res_string); + } + return true; +} + +/********************************************************************** + * count_pitch_stats + * + * Count up the gap and pitch stats on the block to see if it is fixed pitch. + * Blobs with gaps smaller than the lower threshold are assumed to be one. + * The larger threshold is the word gap threshold. + * The return value indicates whether there were any decent values to use. + **********************************************************************/ + +bool count_pitch_stats( // find lines + TO_ROW *row, // row to do + STATS *gap_stats, // blob gaps + STATS *pitch_stats, // centre-centre stats + float initial_pitch, // guess at pitch + float min_space, // estimate space size + bool ignore_outsize, // discard big objects + bool split_outsize, // split big objects + int32_t dm_gap // ignorable gaps +) { + bool prev_valid; // not word broken + BLOBNBOX *blob; // current blob + // blobs + BLOBNBOX_IT blob_it = row->blob_list(); + int32_t prev_right; // end of prev blob + int32_t prev_centre; // centre of previous blob + int32_t x_centre; // centre of this blob + int32_t blob_width; // width of blob + int32_t width_units; // no of widths in blob + float width; // blob width + TBOX blob_box; // bounding box + TBOX joined_box; // of super blob + + gap_stats->clear(); + pitch_stats->clear(); + if (blob_it.empty()) { + return false; + } + prev_valid = false; + prev_centre = 0; + prev_right = 0; // stop compiler warning + joined_box = blob_it.data()->bounding_box(); + do { + blob_it.forward(); + blob = blob_it.data(); + if (!blob->joined_to_prev()) { + blob_box = blob->bounding_box(); + if ((blob_box.left() - joined_box.right() < dm_gap && !blob_it.at_first()) || + blob->cblob() == nullptr) { + joined_box += blob_box; // merge blobs + } else { + blob_width = joined_box.width(); + if (split_outsize) { + width_units = + static_cast<int32_t>(floor(static_cast<float>(blob_width) / initial_pitch + 0.5)); + if (width_units < 1) { + width_units = 1; + } + width_units--; + } else if (ignore_outsize) { + width = static_cast<float>(blob_width) / initial_pitch; + width_units = + width < 1 + words_default_fixed_limit && width > 1 - words_default_fixed_limit ? 0 + : -1; + } else { + width_units = 0; // everything in + } + x_centre = static_cast<int32_t>(joined_box.left() + + (blob_width - width_units * initial_pitch) / 2); + if (prev_valid && width_units >= 0) { + // if (width_units>0) + // { + // tprintf("wu=%d, + // width=%d, + // xc=%d, adding + // %d\n", + // width_units,blob_width,x_centre,x_centre-prev_centre); + // } + gap_stats->add(joined_box.left() - prev_right, 1); + pitch_stats->add(x_centre - prev_centre, 1); + } + prev_centre = static_cast<int32_t>(x_centre + width_units * initial_pitch); + prev_right = joined_box.right(); + prev_valid = blob_box.left() - joined_box.right() < min_space; + prev_valid = prev_valid && width_units >= 0; + joined_box = blob_box; + } + } + } while (!blob_it.at_first()); + return gap_stats->get_total() >= 3; +} + +/********************************************************************** + * tune_row_pitch + * + * Use a dp algorithm to fit the character cells and return the sd of + * the cell size over the row. + **********************************************************************/ + +float tune_row_pitch( // find fp cells + TO_ROW *row, // row to do + STATS *projection, // vertical projection + int16_t projection_left, // edge of projection + int16_t projection_right, // edge of projection + float space_size, // size of blank + float &initial_pitch, // guess at pitch + float &best_sp_sd, // space sd + int16_t &best_mid_cuts, // no of cheap cuts + ICOORDELT_LIST *best_cells, // row cells + bool testing_on // individual words +) { + int pitch_delta; // offset pitch + int16_t mid_cuts; // cheap cuts + float pitch_sd; // current sd + float best_sd; // best result + float best_pitch; // pitch for best result + float initial_sd; // starting error + float sp_sd; // space sd + ICOORDELT_LIST test_cells; // row cells + ICOORDELT_IT best_it; // start of best list + + if (textord_fast_pitch_test) { + return tune_row_pitch2(row, projection, projection_left, projection_right, space_size, + initial_pitch, best_sp_sd, + // space sd + best_mid_cuts, best_cells, testing_on); + } + if (textord_disable_pitch_test) { + best_sp_sd = initial_pitch; + return initial_pitch; + } + initial_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size, + initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on); + best_sd = initial_sd; + best_pitch = initial_pitch; + if (testing_on) { + tprintf("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd); + } + for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) { + pitch_sd = + compute_pitch_sd(row, projection, projection_left, projection_right, space_size, + initial_pitch + pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on); + if (testing_on) { + tprintf("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta, pitch_sd); + } + if (pitch_sd < best_sd) { + best_sd = pitch_sd; + best_mid_cuts = mid_cuts; + best_sp_sd = sp_sd; + best_pitch = initial_pitch + pitch_delta; + best_cells->clear(); + best_it.set_to_list(best_cells); + best_it.add_list_after(&test_cells); + } else { + test_cells.clear(); + } + if (pitch_sd > initial_sd) { + break; // getting worse + } + } + for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) { + pitch_sd = + compute_pitch_sd(row, projection, projection_left, projection_right, space_size, + initial_pitch - pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on); + if (testing_on) { + tprintf("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta, pitch_sd); + } + if (pitch_sd < best_sd) { + best_sd = pitch_sd; + best_mid_cuts = mid_cuts; + best_sp_sd = sp_sd; + best_pitch = initial_pitch - pitch_delta; + best_cells->clear(); + best_it.set_to_list(best_cells); + best_it.add_list_after(&test_cells); + } else { + test_cells.clear(); + } + if (pitch_sd > initial_sd) { + break; + } + } + initial_pitch = best_pitch; + + if (textord_debug_pitch_metric) { + print_pitch_sd(row, projection, projection_left, projection_right, space_size, best_pitch); + } + + return best_sd; +} + +/********************************************************************** + * tune_row_pitch + * + * Use a dp algorithm to fit the character cells and return the sd of + * the cell size over the row. + **********************************************************************/ + +float tune_row_pitch2( // find fp cells + TO_ROW *row, // row to do + STATS *projection, // vertical projection + int16_t projection_left, // edge of projection + int16_t projection_right, // edge of projection + float space_size, // size of blank + float &initial_pitch, // guess at pitch + float &best_sp_sd, // space sd + int16_t &best_mid_cuts, // no of cheap cuts + ICOORDELT_LIST *best_cells, // row cells + bool testing_on // individual words +) { + int pitch_delta; // offset pitch + int16_t pixel; // pixel coord + int16_t best_pixel; // pixel coord + int16_t best_delta; // best pitch + int16_t best_pitch; // best pitch + int16_t start; // of good range + int16_t end; // of good range + int32_t best_count; // lowest sum + float best_sd; // best result + + best_sp_sd = initial_pitch; + + best_pitch = static_cast<int>(initial_pitch); + if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) { + return initial_pitch; + } + std::unique_ptr<STATS[]> sum_proj(new STATS[textord_pitch_range * 2 + 1]); // summed projection + + for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) { + sum_proj[textord_pitch_range + pitch_delta].set_range(0, best_pitch + pitch_delta); + } + for (pixel = projection_left; pixel <= projection_right; pixel++) { + for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) { + sum_proj[textord_pitch_range + pitch_delta].add( + (pixel - projection_left) % (best_pitch + pitch_delta), projection->pile_count(pixel)); + } + } + best_count = sum_proj[textord_pitch_range].pile_count(0); + best_delta = 0; + best_pixel = 0; + for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) { + for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) { + if (sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel) < best_count) { + best_count = sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel); + best_delta = pitch_delta; + best_pixel = pixel; + } + } + } + if (testing_on) { + tprintf("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n", initial_pitch, best_delta, + best_count); + } + best_pitch += best_delta; + initial_pitch = best_pitch; + best_count++; + best_count += best_count; + for (start = best_pixel - 2; + start > best_pixel - best_pitch && + sum_proj[textord_pitch_range + best_delta].pile_count(start % best_pitch) <= best_count; + start--) { + ; + } + for (end = best_pixel + 2; + end < best_pixel + best_pitch && + sum_proj[textord_pitch_range + best_delta].pile_count(end % best_pitch) <= best_count; + end++) { + ; + } + + best_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size, + initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on, + start, end); + if (testing_on) { + tprintf("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch, best_sd); + } + + if (textord_debug_pitch_metric) { + print_pitch_sd(row, projection, projection_left, projection_right, space_size, initial_pitch); + } + + return best_sd; +} + +/********************************************************************** + * compute_pitch_sd + * + * Use a dp algorithm to fit the character cells and return the sd of + * the cell size over the row. + **********************************************************************/ + +float compute_pitch_sd( // find fp cells + TO_ROW *row, // row to do + STATS *projection, // vertical projection + int16_t projection_left, // edge + int16_t projection_right, // edge + float space_size, // size of blank + float initial_pitch, // guess at pitch + float &sp_sd, // space sd + int16_t &mid_cuts, // no of free cuts + ICOORDELT_LIST *row_cells, // list of chop pts + bool testing_on, // individual words + int16_t start, // start of good range + int16_t end // end of good range +) { + int16_t occupation; // no of cells in word. + // blobs + BLOBNBOX_IT blob_it = row->blob_list(); + BLOBNBOX_IT start_it; // start of word + BLOBNBOX_IT plot_it; // for plotting + int16_t blob_count; // no of blobs + TBOX blob_box; // bounding box + TBOX prev_box; // of super blob + int32_t prev_right; // of word sync + int scale_factor; // on scores for big words + int32_t sp_count; // spaces + FPSEGPT_LIST seg_list; // char cells + FPSEGPT_IT seg_it; // iterator + int16_t segpos; // position of segment + int16_t cellpos; // previous cell boundary + // iterator + ICOORDELT_IT cell_it = row_cells; + ICOORDELT *cell; // new cell + double sqsum; // sum of squares + double spsum; // of spaces + double sp_var; // space error + double word_sync; // result for word + int32_t total_count; // total blobs + + if ((pitsync_linear_version & 3) > 1) { + word_sync = compute_pitch_sd2(row, projection, projection_left, projection_right, initial_pitch, + occupation, mid_cuts, row_cells, testing_on, start, end); + sp_sd = occupation; + return word_sync; + } + mid_cuts = 0; + cellpos = 0; + total_count = 0; + sqsum = 0; + sp_count = 0; + spsum = 0; + prev_right = -1; + if (blob_it.empty()) { + return space_size * 10; + } +#ifndef GRAPHICS_DISABLED + if (testing_on && to_win != nullptr) { + blob_box = blob_it.data()->bounding_box(); + projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL); + } +#endif + start_it = blob_it; + blob_count = 0; + blob_box = box_next(&blob_it); // first blob + blob_it.mark_cycle_pt(); + do { + for (; blob_count > 0; blob_count--) { + box_next(&start_it); + } + do { + prev_box = blob_box; + blob_count++; + blob_box = box_next(&blob_it); + } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size); + plot_it = start_it; + if (pitsync_linear_version & 3) { + word_sync = check_pitch_sync2(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2, + projection, projection_left, projection_right, + row->xheight * textord_projection_scale, occupation, &seg_list, + start, end); + } else { + word_sync = check_pitch_sync(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2, + projection, &seg_list); + } + if (testing_on) { + tprintf("Word ending at (%d,%d), len=%d, sync rating=%g, ", prev_box.right(), prev_box.top(), + seg_list.length() - 1, word_sync); + seg_it.set_to_list(&seg_list); + for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { + if (seg_it.data()->faked) { + tprintf("(F)"); + } + tprintf("%d, ", seg_it.data()->position()); + // tprintf("C=%g, s=%g, sq=%g\n", + // seg_it.data()->cost_function(), + // seg_it.data()->sum(), + // seg_it.data()->squares()); + } + tprintf("\n"); + } +#ifndef GRAPHICS_DISABLED + if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) { + plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); + } +#endif + seg_it.set_to_list(&seg_list); + if (prev_right >= 0) { + sp_var = seg_it.data()->position() - prev_right; + sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch; + sp_var *= sp_var; + spsum += sp_var; + sp_count++; + } + for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { + segpos = seg_it.data()->position(); + if (cell_it.empty() || segpos > cellpos + initial_pitch / 2) { + // big gap + while (!cell_it.empty() && segpos > cellpos + initial_pitch * 3 / 2) { + cell = new ICOORDELT(cellpos + static_cast<int16_t>(initial_pitch), 0); + cell_it.add_after_then_move(cell); + cellpos += static_cast<int16_t>(initial_pitch); + } + // make new one + cell = new ICOORDELT(segpos, 0); + cell_it.add_after_then_move(cell); + cellpos = segpos; + } else if (segpos > cellpos - initial_pitch / 2) { + cell = cell_it.data(); + // average positions + cell->set_x((cellpos + segpos) / 2); + cellpos = cell->x(); + } + } + seg_it.move_to_last(); + prev_right = seg_it.data()->position(); + if (textord_pitch_scalebigwords) { + scale_factor = (seg_list.length() - 2) / 2; + if (scale_factor < 1) { + scale_factor = 1; + } + } else { + scale_factor = 1; + } + sqsum += word_sync * scale_factor; + total_count += (seg_list.length() - 1) * scale_factor; + seg_list.clear(); + } while (!blob_it.cycled_list()); + sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0; + return total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10; +} + +/********************************************************************** + * compute_pitch_sd2 + * + * Use a dp algorithm to fit the character cells and return the sd of + * the cell size over the row. + **********************************************************************/ + +float compute_pitch_sd2( // find fp cells + TO_ROW *row, // row to do + STATS *projection, // vertical projection + int16_t projection_left, // edge + int16_t projection_right, // edge + float initial_pitch, // guess at pitch + int16_t &occupation, // no of occupied cells + int16_t &mid_cuts, // no of free cuts + ICOORDELT_LIST *row_cells, // list of chop pts + bool testing_on, // individual words + int16_t start, // start of good range + int16_t end // end of good range +) { + // blobs + BLOBNBOX_IT blob_it = row->blob_list(); + BLOBNBOX_IT plot_it; + int16_t blob_count; // no of blobs + TBOX blob_box; // bounding box + FPSEGPT_LIST seg_list; // char cells + FPSEGPT_IT seg_it; // iterator + int16_t segpos; // position of segment + // iterator + ICOORDELT_IT cell_it = row_cells; + ICOORDELT *cell; // new cell + double word_sync; // result for word + + mid_cuts = 0; + if (blob_it.empty()) { + occupation = 0; + return initial_pitch * 10; + } +#ifndef GRAPHICS_DISABLED + if (testing_on && to_win != nullptr) { + projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL); + } +#endif + blob_count = 0; + blob_it.mark_cycle_pt(); + do { + // first blob + blob_box = box_next(&blob_it); + blob_count++; + } while (!blob_it.cycled_list()); + plot_it = blob_it; + word_sync = check_pitch_sync2( + &blob_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left, + projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, start, end); + if (testing_on) { + tprintf("Row ending at (%d,%d), len=%d, sync rating=%g, ", blob_box.right(), blob_box.top(), + seg_list.length() - 1, word_sync); + seg_it.set_to_list(&seg_list); + for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { + if (seg_it.data()->faked) { + tprintf("(F)"); + } + tprintf("%d, ", seg_it.data()->position()); + // tprintf("C=%g, s=%g, sq=%g\n", + // seg_it.data()->cost_function(), + // seg_it.data()->sum(), + // seg_it.data()->squares()); + } + tprintf("\n"); + } +#ifndef GRAPHICS_DISABLED + if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) { + plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); + } +#endif + seg_it.set_to_list(&seg_list); + for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { + segpos = seg_it.data()->position(); + // make new one + cell = new ICOORDELT(segpos, 0); + cell_it.add_after_then_move(cell); + if (seg_it.at_last()) { + mid_cuts = seg_it.data()->cheap_cuts(); + } + } + seg_list.clear(); + return occupation > 0 ? sqrt(word_sync / occupation) : initial_pitch * 10; +} + +/********************************************************************** + * print_pitch_sd + * + * Use a dp algorithm to fit the character cells and return the sd of + * the cell size over the row. + **********************************************************************/ + +void print_pitch_sd( // find fp cells + TO_ROW *row, // row to do + STATS *projection, // vertical projection + int16_t projection_left, // edges //size of blank + int16_t projection_right, float space_size, + float initial_pitch // guess at pitch +) { + const char *res2; // pitch result + int16_t occupation; // used cells + float sp_sd; // space sd + // blobs + BLOBNBOX_IT blob_it = row->blob_list(); + BLOBNBOX_IT start_it; // start of word + BLOBNBOX_IT row_start; // start of row + int16_t blob_count; // no of blobs + int16_t total_blob_count; // total blobs in line + TBOX blob_box; // bounding box + TBOX prev_box; // of super blob + int32_t prev_right; // of word sync + int scale_factor; // on scores for big words + int32_t sp_count; // spaces + FPSEGPT_LIST seg_list; // char cells + FPSEGPT_IT seg_it; // iterator + double sqsum; // sum of squares + double spsum; // of spaces + double sp_var; // space error + double word_sync; // result for word + double total_count; // total cuts + + if (blob_it.empty()) { + return; + } + row_start = blob_it; + total_blob_count = 0; + + total_count = 0; + sqsum = 0; + sp_count = 0; + spsum = 0; + prev_right = -1; + blob_it = row_start; + start_it = blob_it; + blob_count = 0; + blob_box = box_next(&blob_it); // first blob + blob_it.mark_cycle_pt(); + do { + for (; blob_count > 0; blob_count--) { + box_next(&start_it); + } + do { + prev_box = blob_box; + blob_count++; + blob_box = box_next(&blob_it); + } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size); + word_sync = check_pitch_sync2( + &start_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left, + projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0); + total_blob_count += blob_count; + seg_it.set_to_list(&seg_list); + if (prev_right >= 0) { + sp_var = seg_it.data()->position() - prev_right; + sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch; + sp_var *= sp_var; + spsum += sp_var; + sp_count++; + } + seg_it.move_to_last(); + prev_right = seg_it.data()->position(); + if (textord_pitch_scalebigwords) { + scale_factor = (seg_list.length() - 2) / 2; + if (scale_factor < 1) { + scale_factor = 1; + } + } else { + scale_factor = 1; + } + sqsum += word_sync * scale_factor; + total_count += (seg_list.length() - 1) * scale_factor; + seg_list.clear(); + } while (!blob_it.cycled_list()); + sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0; + word_sync = total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10; + tprintf("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:", word_sync, word_sync / initial_pitch, sp_sd, + word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P'); + + start_it = row_start; + blob_it = row_start; + word_sync = + check_pitch_sync2(&blob_it, total_blob_count, static_cast<int16_t>(initial_pitch), 2, + projection, projection_left, projection_right, + row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0); + if (occupation > 1) { + word_sync /= occupation; + } + word_sync = sqrt(word_sync); + +#ifndef GRAPHICS_DISABLED + if (textord_show_row_cuts && to_win != nullptr) { + plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list); + } +#endif + seg_list.clear(); + if (word_sync < textord_words_pitchsd_threshold * initial_pitch) { + if (word_sync < textord_words_def_fixed * initial_pitch && !row->all_caps) { + res2 = "DF"; + } else { + res2 = "MF"; + } + } else { + res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP"; + } + tprintf( + "row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, " + "all_caps=%d\n", + word_sync, word_sync / initial_pitch, + word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P', occupation, res2, + initial_pitch, row->fixed_pitch, row->all_caps); +} + +/********************************************************************** + * find_repeated_chars + * + * Extract marked leader blobs and put them + * into words in advance of fixed pitch checking and word generation. + **********************************************************************/ +void find_repeated_chars(TO_BLOCK *block, // Block to search. + bool testing_on) { // Debug mode. + POLY_BLOCK *pb = block->block->pdblk.poly_block(); + if (pb != nullptr && !pb->IsText()) { + return; // Don't find repeated chars in non-text blocks. + } + + TO_ROW *row; + BLOBNBOX_IT box_it; + BLOBNBOX_IT search_it; // forward search + WERD *word; // new word + TBOX word_box; // for plotting + int blobcount, repeated_set; + + TO_ROW_IT row_it = block->get_rows(); + if (row_it.empty()) { + return; // empty block + } + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + box_it.set_to_list(row->blob_list()); + if (box_it.empty()) { + continue; // no blobs in this row + } + if (!row->rep_chars_marked()) { + mark_repeated_chars(row); + } + if (row->num_repeated_sets() == 0) { + continue; // nothing to do for this row + } + // new words + WERD_IT word_it(&row->rep_words); + do { + if (box_it.data()->repeated_set() != 0 && !box_it.data()->joined_to_prev()) { + blobcount = 1; + repeated_set = box_it.data()->repeated_set(); + search_it = box_it; + search_it.forward(); + while (!search_it.at_first() && search_it.data()->repeated_set() == repeated_set) { + blobcount++; + search_it.forward(); + } + // After the call to make_real_word() all the blobs from this + // repeated set will be removed from the blob list. box_it will be + // set to point to the blob after the end of the extracted sequence. + word = make_real_word(&box_it, blobcount, box_it.at_first(), 1); + if (!box_it.empty() && box_it.data()->joined_to_prev()) { + tprintf("Bad box joined to prev at"); + box_it.data()->bounding_box().print(); + tprintf("After repeated word:"); + word->bounding_box().print(); + } + ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev()); + word->set_flag(W_REP_CHAR, true); + word->set_flag(W_DONT_CHOP, true); + word_it.add_after_then_move(word); + } else { + box_it.forward(); + } + } while (!box_it.at_first()); + } +} + +/********************************************************************** + * plot_fp_word + * + * Plot a block of words as if fixed pitch. + **********************************************************************/ + +#ifndef GRAPHICS_DISABLED +void plot_fp_word( // draw block of words + TO_BLOCK *block, // block to draw + float pitch, // pitch to draw with + float nonspace // for space threshold +) { + TO_ROW *row; // current row + TO_ROW_IT row_it = block->get_rows(); + + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + row->min_space = static_cast<int32_t>((pitch + nonspace) / 2); + row->max_nonspace = row->min_space; + row->space_threshold = row->min_space; + plot_word_decisions(to_win, static_cast<int16_t>(pitch), row); + } +} +#endif + +} // namespace tesseract
