Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/tospace.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 2 // you may not use this file except in compliance with the License. | |
| 3 // You may obtain a copy of the License at | |
| 4 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 5 // Unless required by applicable law or agreed to in writing, software | |
| 6 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 8 // See the License for the specific language governing permissions and | |
| 9 // limitations under the License. | |
| 10 /********************************************************************** | |
| 11 * tospace.cpp | |
| 12 * | |
| 13 * Compute fuzzy word spacing thresholds for each row. | |
| 14 * I.e. set : max_nonspace | |
| 15 * space_threshold | |
| 16 * min_space | |
| 17 * kern_size | |
| 18 * space_size | |
| 19 * for each row. | |
| 20 * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE | |
| 21 * | |
| 22 * Note: functions in this file were originally not members of any | |
| 23 * class or enclosed by any namespace. Now they are all static members | |
| 24 * of the Textord class. | |
| 25 * | |
| 26 **********************************************************************/ | |
| 27 | |
| 28 #include "drawtord.h" | |
| 29 #include "statistc.h" | |
| 30 #include "textord.h" | |
| 31 #include "tovars.h" | |
| 32 | |
| 33 // Include automatically generated configuration file if running autoconf. | |
| 34 #ifdef HAVE_CONFIG_H | |
| 35 # include "config_auto.h" | |
| 36 #endif | |
| 37 | |
| 38 #include <algorithm> | |
| 39 #include <cmath> | |
| 40 #include <memory> | |
| 41 | |
| 42 #define MAXSPACING 128 /*max expected spacing in pix */ | |
| 43 | |
| 44 namespace tesseract { | |
| 45 void Textord::to_spacing(ICOORD page_tr, // topright of page | |
| 46 TO_BLOCK_LIST *blocks // blocks on page | |
| 47 ) { | |
| 48 TO_BLOCK_IT block_it; // iterator | |
| 49 TO_BLOCK *block; // current block; | |
| 50 TO_ROW *row; // current row | |
| 51 int block_index; // block number | |
| 52 int row_index; // row number | |
| 53 // estimated width of real spaces for whole block | |
| 54 int16_t block_space_gap_width; | |
| 55 // estimated width of non space gaps for whole block | |
| 56 int16_t block_non_space_gap_width; | |
| 57 bool old_text_ord_proportional; // old fixed/prop result | |
| 58 | |
| 59 block_it.set_to_list(blocks); | |
| 60 block_index = 1; | |
| 61 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 62 block = block_it.data(); | |
| 63 std::unique_ptr<GAPMAP> gapmap(new GAPMAP(block)); // map of big vert gaps in blk | |
| 64 block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width, | |
| 65 block_non_space_gap_width); | |
| 66 // Make sure relative values of block-level space and non-space gap | |
| 67 // widths are reasonable. The ratio of 1:3 is also used in | |
| 68 // block_spacing_stats, to correct the block_space_gap_width. | |
| 69 // Useful for arabic and hindi, when the non-space gap width is | |
| 70 // often over-estimated and should not be trusted. A similar ratio | |
| 71 // is found in block_spacing_stats. | |
| 72 if (tosp_old_to_method && tosp_old_to_constrain_sp_kn && | |
| 73 block_non_space_gap_width > block_space_gap_width / 3) { | |
| 74 block_non_space_gap_width = block_space_gap_width / 3; | |
| 75 } | |
| 76 // row iterator | |
| 77 TO_ROW_IT row_it(block->get_rows()); | |
| 78 row_index = 1; | |
| 79 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 80 row = row_it.data(); | |
| 81 if ((row->pitch_decision == PITCH_DEF_PROP) || (row->pitch_decision == PITCH_CORR_PROP)) { | |
| 82 if ((tosp_debug_level > 0) && !old_text_ord_proportional) { | |
| 83 tprintf("Block %d Row %d: Now Proportional\n", block_index, row_index); | |
| 84 } | |
| 85 row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width, | |
| 86 block_non_space_gap_width); | |
| 87 } else { | |
| 88 if ((tosp_debug_level > 0) && old_text_ord_proportional) { | |
| 89 tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_index, | |
| 90 row_index, row->pitch_decision, row->fixed_pitch); | |
| 91 } | |
| 92 } | |
| 93 #ifndef GRAPHICS_DISABLED | |
| 94 if (textord_show_initial_words) { | |
| 95 plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row); | |
| 96 } | |
| 97 #endif | |
| 98 row_index++; | |
| 99 } | |
| 100 block_index++; | |
| 101 } | |
| 102 } | |
| 103 | |
| 104 /************************************************************************* | |
| 105 * block_spacing_stats() | |
| 106 *************************************************************************/ | |
| 107 | |
| 108 void Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional, | |
| 109 int16_t &block_space_gap_width, // resulting estimate | |
| 110 int16_t &block_non_space_gap_width // resulting estimate | |
| 111 ) { | |
| 112 TO_ROW *row; // current row | |
| 113 BLOBNBOX_IT blob_it; // iterator | |
| 114 | |
| 115 STATS centre_to_centre_stats(0, MAXSPACING - 1); | |
| 116 // DEBUG USE ONLY | |
| 117 STATS all_gap_stats(0, MAXSPACING - 1); | |
| 118 STATS space_gap_stats(0, MAXSPACING - 1); | |
| 119 int16_t minwidth = MAXSPACING; // narrowest blob | |
| 120 TBOX blob_box; | |
| 121 TBOX prev_blob_box; | |
| 122 int16_t centre_to_centre; | |
| 123 int16_t gap_width; | |
| 124 float real_space_threshold; | |
| 125 float iqr_centre_to_centre; // DEBUG USE ONLY | |
| 126 float iqr_all_gap_stats; // DEBUG USE ONLY | |
| 127 int32_t end_of_row; | |
| 128 int32_t row_length; | |
| 129 | |
| 130 // row iterator | |
| 131 TO_ROW_IT row_it(block->get_rows()); | |
| 132 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 133 row = row_it.data(); | |
| 134 if (!row->blob_list()->empty() && | |
| 135 (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) || | |
| 136 (row->pitch_decision == PITCH_CORR_PROP))) { | |
| 137 blob_it.set_to_list(row->blob_list()); | |
| 138 blob_it.mark_cycle_pt(); | |
| 139 end_of_row = blob_it.data_relative(-1)->bounding_box().right(); | |
| 140 if (tosp_use_pre_chopping) { | |
| 141 blob_box = box_next_pre_chopped(&blob_it); | |
| 142 } else if (tosp_stats_use_xht_gaps) { | |
| 143 blob_box = reduced_box_next(row, &blob_it); | |
| 144 } else { | |
| 145 blob_box = box_next(&blob_it); | |
| 146 } | |
| 147 row_length = end_of_row - blob_box.left(); | |
| 148 if (blob_box.width() < minwidth) { | |
| 149 minwidth = blob_box.width(); | |
| 150 } | |
| 151 prev_blob_box = blob_box; | |
| 152 while (!blob_it.cycled_list()) { | |
| 153 if (tosp_use_pre_chopping) { | |
| 154 blob_box = box_next_pre_chopped(&blob_it); | |
| 155 } else if (tosp_stats_use_xht_gaps) { | |
| 156 blob_box = reduced_box_next(row, &blob_it); | |
| 157 } else { | |
| 158 blob_box = box_next(&blob_it); | |
| 159 } | |
| 160 if (blob_box.width() < minwidth) { | |
| 161 minwidth = blob_box.width(); | |
| 162 } | |
| 163 int16_t left = prev_blob_box.right(); | |
| 164 int16_t right = blob_box.left(); | |
| 165 gap_width = right - left; | |
| 166 if (!ignore_big_gap(row, row_length, gapmap, left, right)) { | |
| 167 all_gap_stats.add(gap_width, 1); | |
| 168 | |
| 169 centre_to_centre = (right + blob_box.right() - (prev_blob_box.left() + left)) / 2; | |
| 170 // DEBUG | |
| 171 centre_to_centre_stats.add(centre_to_centre, 1); | |
| 172 // DEBUG | |
| 173 } | |
| 174 prev_blob_box = blob_box; | |
| 175 } | |
| 176 } | |
| 177 } | |
| 178 | |
| 179 // Inadequate samples | |
| 180 if (all_gap_stats.get_total() <= 1) { | |
| 181 block_non_space_gap_width = minwidth; | |
| 182 block_space_gap_width = -1; // No est. space width | |
| 183 // DEBUG | |
| 184 old_text_ord_proportional = true; | |
| 185 } else { | |
| 186 /* For debug only ..... */ | |
| 187 iqr_centre_to_centre = centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25); | |
| 188 iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25); | |
| 189 old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats; | |
| 190 /* .......For debug only */ | |
| 191 | |
| 192 /* | |
| 193 The median of the gaps is used as an estimate of the NON-SPACE gap width. | |
| 194 This RELIES on the assumption that there are more gaps WITHIN words than | |
| 195 BETWEEN words in a block | |
| 196 | |
| 197 Now try to estimate the width of a real space for all real spaces in the | |
| 198 block. Do this by using a crude threshold to ignore "narrow" gaps, then | |
| 199 find the median of the "wide" gaps and use this. | |
| 200 */ | |
| 201 block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median())); | |
| 202 // median gap | |
| 203 | |
| 204 row_it.set_to_list(block->get_rows()); | |
| 205 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 206 row = row_it.data(); | |
| 207 if (!row->blob_list()->empty() && | |
| 208 (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) || | |
| 209 (row->pitch_decision == PITCH_CORR_PROP))) { | |
| 210 real_space_threshold = std::max(tosp_init_guess_kn_mult * block_non_space_gap_width, | |
| 211 tosp_init_guess_xht_mult * row->xheight); | |
| 212 blob_it.set_to_list(row->blob_list()); | |
| 213 blob_it.mark_cycle_pt(); | |
| 214 end_of_row = blob_it.data_relative(-1)->bounding_box().right(); | |
| 215 if (tosp_use_pre_chopping) { | |
| 216 blob_box = box_next_pre_chopped(&blob_it); | |
| 217 } else if (tosp_stats_use_xht_gaps) { | |
| 218 blob_box = reduced_box_next(row, &blob_it); | |
| 219 } else { | |
| 220 blob_box = box_next(&blob_it); | |
| 221 } | |
| 222 row_length = blob_box.left() - end_of_row; | |
| 223 prev_blob_box = blob_box; | |
| 224 while (!blob_it.cycled_list()) { | |
| 225 if (tosp_use_pre_chopping) { | |
| 226 blob_box = box_next_pre_chopped(&blob_it); | |
| 227 } else if (tosp_stats_use_xht_gaps) { | |
| 228 blob_box = reduced_box_next(row, &blob_it); | |
| 229 } else { | |
| 230 blob_box = box_next(&blob_it); | |
| 231 } | |
| 232 int16_t left = prev_blob_box.right(); | |
| 233 int16_t right = blob_box.left(); | |
| 234 gap_width = right - left; | |
| 235 if ((gap_width > real_space_threshold) && | |
| 236 !ignore_big_gap(row, row_length, gapmap, left, right)) { | |
| 237 /* | |
| 238 If tosp_use_cert_spaces is enabled, the estimate of the space gap is | |
| 239 restricted to obvious spaces - those wider than half the xht or | |
| 240 those with wide blobs on both sides - i.e not things that are | |
| 241 suspect 1's or punctuation that is sometimes widely spaced. | |
| 242 */ | |
| 243 if (!tosp_block_use_cert_spaces || | |
| 244 (gap_width > tosp_fuzzy_space_factor2 * row->xheight) || | |
| 245 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && | |
| 246 (!tosp_narrow_blobs_not_cert || | |
| 247 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) || | |
| 248 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { | |
| 249 space_gap_stats.add(gap_width, 1); | |
| 250 } | |
| 251 } | |
| 252 prev_blob_box = blob_box; | |
| 253 } | |
| 254 } | |
| 255 } | |
| 256 // Inadequate samples | |
| 257 if (space_gap_stats.get_total() <= 2) { | |
| 258 block_space_gap_width = -1; // No est. space width | |
| 259 } else { | |
| 260 block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())), | |
| 261 static_cast<int16_t>(3 * block_non_space_gap_width)); | |
| 262 } | |
| 263 } | |
| 264 } | |
| 265 | |
| 266 /************************************************************************* | |
| 267 * row_spacing_stats() | |
| 268 * Set values for min_space, max_non_space based on row stats only | |
| 269 * If failure - return 0 values. | |
| 270 *************************************************************************/ | |
| 271 void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx, | |
| 272 int16_t block_space_gap_width, // estimate for block | |
| 273 int16_t block_non_space_gap_width // estimate for block | |
| 274 ) { | |
| 275 // iterator | |
| 276 BLOBNBOX_IT blob_it = row->blob_list(); | |
| 277 STATS all_gap_stats(0, MAXSPACING - 1); | |
| 278 STATS cert_space_gap_stats(0, MAXSPACING - 1); | |
| 279 STATS all_space_gap_stats(0, MAXSPACING - 1); | |
| 280 STATS small_gap_stats(0, MAXSPACING - 1); | |
| 281 TBOX blob_box; | |
| 282 TBOX prev_blob_box; | |
| 283 int16_t gap_width; | |
| 284 int16_t real_space_threshold = 0; | |
| 285 int16_t max = 0; | |
| 286 int16_t large_gap_count = 0; | |
| 287 bool suspected_table; | |
| 288 bool good_block_space_estimate = block_space_gap_width > 0; | |
| 289 int32_t end_of_row; | |
| 290 int32_t row_length = 0; | |
| 291 float sane_space; | |
| 292 int32_t sane_threshold; | |
| 293 | |
| 294 /* Collect first pass stats for row */ | |
| 295 | |
| 296 if (!good_block_space_estimate) { | |
| 297 block_space_gap_width = int16_t(std::floor(row->xheight / 2)); | |
| 298 } | |
| 299 if (!row->blob_list()->empty()) { | |
| 300 if (tosp_threshold_bias1 > 0) { | |
| 301 real_space_threshold = | |
| 302 block_non_space_gap_width + | |
| 303 int16_t(floor(0.5 + tosp_threshold_bias1 * | |
| 304 (block_space_gap_width - block_non_space_gap_width))); | |
| 305 } else { | |
| 306 real_space_threshold = // Old TO method | |
| 307 (block_space_gap_width + block_non_space_gap_width) / 2; | |
| 308 } | |
| 309 blob_it.set_to_list(row->blob_list()); | |
| 310 blob_it.mark_cycle_pt(); | |
| 311 end_of_row = blob_it.data_relative(-1)->bounding_box().right(); | |
| 312 if (tosp_use_pre_chopping) { | |
| 313 blob_box = box_next_pre_chopped(&blob_it); | |
| 314 } else if (tosp_stats_use_xht_gaps) { | |
| 315 blob_box = reduced_box_next(row, &blob_it); | |
| 316 } else { | |
| 317 blob_box = box_next(&blob_it); | |
| 318 } | |
| 319 row_length = end_of_row - blob_box.left(); | |
| 320 prev_blob_box = blob_box; | |
| 321 while (!blob_it.cycled_list()) { | |
| 322 if (tosp_use_pre_chopping) { | |
| 323 blob_box = box_next_pre_chopped(&blob_it); | |
| 324 } else if (tosp_stats_use_xht_gaps) { | |
| 325 blob_box = reduced_box_next(row, &blob_it); | |
| 326 } else { | |
| 327 blob_box = box_next(&blob_it); | |
| 328 } | |
| 329 int16_t left = prev_blob_box.right(); | |
| 330 int16_t right = blob_box.left(); | |
| 331 gap_width = right - left; | |
| 332 if (ignore_big_gap(row, row_length, gapmap, left, right)) { | |
| 333 large_gap_count++; | |
| 334 } else { | |
| 335 if (gap_width >= real_space_threshold) { | |
| 336 if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) || | |
| 337 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && | |
| 338 (!tosp_narrow_blobs_not_cert || | |
| 339 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) || | |
| 340 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { | |
| 341 cert_space_gap_stats.add(gap_width, 1); | |
| 342 } | |
| 343 all_space_gap_stats.add(gap_width, 1); | |
| 344 } else { | |
| 345 small_gap_stats.add(gap_width, 1); | |
| 346 } | |
| 347 all_gap_stats.add(gap_width, 1); | |
| 348 } | |
| 349 prev_blob_box = blob_box; | |
| 350 } | |
| 351 } | |
| 352 suspected_table = (large_gap_count > 1) || | |
| 353 ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples)); | |
| 354 | |
| 355 /* Now determine row kern size, space size and threshold */ | |
| 356 | |
| 357 if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) || | |
| 358 ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) && | |
| 359 cert_space_gap_stats.get_total() > 0)) { | |
| 360 old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats, | |
| 361 block_space_gap_width, block_non_space_gap_width); | |
| 362 } else { | |
| 363 if (!tosp_recovery_isolated_row_stats || | |
| 364 !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) { | |
| 365 if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) { | |
| 366 tprintf("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx); | |
| 367 } | |
| 368 if (tosp_row_use_cert_spaces1 && good_block_space_estimate) { | |
| 369 // Use block default | |
| 370 row->space_size = block_space_gap_width; | |
| 371 if (all_gap_stats.get_total() > tosp_redo_kern_limit) { | |
| 372 row->kern_size = all_gap_stats.median(); | |
| 373 } else { | |
| 374 row->kern_size = block_non_space_gap_width; | |
| 375 } | |
| 376 row->space_threshold = | |
| 377 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor)); | |
| 378 } else { | |
| 379 old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats, | |
| 380 block_space_gap_width, block_non_space_gap_width); | |
| 381 } | |
| 382 } | |
| 383 } | |
| 384 | |
| 385 if (tosp_improve_thresh && !suspected_table) { | |
| 386 improve_row_threshold(row, &all_gap_stats); | |
| 387 } | |
| 388 | |
| 389 /* Now lets try to be careful not to do anything silly with tables when we | |
| 390 are ignoring big gaps*/ | |
| 391 if (tosp_sanity_method == 0) { | |
| 392 if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) { | |
| 393 if (tosp_debug_level > 5) { | |
| 394 tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx, | |
| 395 row->kern_size, row->space_threshold, row->space_size); | |
| 396 } | |
| 397 row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size); | |
| 398 row->space_size = std::max(row->space_threshold + 1.0f, row->xheight); | |
| 399 } | |
| 400 } else if (tosp_sanity_method == 1) { | |
| 401 sane_space = row->space_size; | |
| 402 /* NEVER let space size get too close to kern size */ | |
| 403 if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) || | |
| 404 ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) { | |
| 405 if (good_block_space_estimate && | |
| 406 (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) { | |
| 407 sane_space = block_space_gap_width; | |
| 408 } else { | |
| 409 sane_space = | |
| 410 std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f), | |
| 411 row->xheight / 2.0f); | |
| 412 } | |
| 413 if (tosp_debug_level > 5) { | |
| 414 tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx, | |
| 415 row->kern_size, row->space_threshold, row->space_size, sane_space); | |
| 416 } | |
| 417 row->space_size = sane_space; | |
| 418 row->space_threshold = | |
| 419 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor)); | |
| 420 } | |
| 421 /* NEVER let threshold get VERY far away from kern */ | |
| 422 sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f))); | |
| 423 if (row->space_threshold > sane_threshold) { | |
| 424 if (tosp_debug_level > 5) { | |
| 425 tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx, | |
| 426 row->kern_size, row->space_threshold, row->space_size, sane_threshold); | |
| 427 } | |
| 428 row->space_threshold = sane_threshold; | |
| 429 if (row->space_size <= sane_threshold) { | |
| 430 row->space_size = row->space_threshold + 1.0f; | |
| 431 } | |
| 432 } | |
| 433 /* Beware of tables - there may be NO spaces */ | |
| 434 if (suspected_table) { | |
| 435 sane_space = | |
| 436 std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight); | |
| 437 sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2)); | |
| 438 | |
| 439 if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) { | |
| 440 if (tosp_debug_level > 5) { | |
| 441 tprintf("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx, | |
| 442 row->kern_size, row->space_threshold, row->space_size); | |
| 443 } | |
| 444 // the minimum sane value | |
| 445 row->space_threshold = static_cast<int32_t>(sane_space); | |
| 446 row->space_size = std::max(row->space_threshold + 1.0f, row->xheight); | |
| 447 } | |
| 448 } | |
| 449 } | |
| 450 | |
| 451 /* Now lets try to put some error limits on the threshold */ | |
| 452 | |
| 453 if (tosp_old_to_method) { | |
| 454 /* Old textord made a space if gap >= threshold */ | |
| 455 // NO FUZZY SPACES YET | |
| 456 row->max_nonspace = row->space_threshold; | |
| 457 // NO FUZZY SPACES YET | |
| 458 row->min_space = row->space_threshold + 1; | |
| 459 } else { | |
| 460 /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */ | |
| 461 row->min_space = | |
| 462 std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size)); | |
| 463 if (row->min_space <= row->space_threshold) { | |
| 464 // Don't be silly | |
| 465 row->min_space = row->space_threshold + 1; | |
| 466 } | |
| 467 /* | |
| 468 Lets try to guess the max certain kern gap by looking at the cluster of | |
| 469 kerns for the row. The row is proportional so the kerns should cluster | |
| 470 tightly at the bottom of the distribution. We also expect most gaps to be | |
| 471 kerns. Find the maximum of the kern piles between 0 and twice the kern | |
| 472 estimate. Piles before the first one with less than 1/10 the maximum | |
| 473 number of samples can be taken as certain kerns. | |
| 474 | |
| 475 Of course, there are some cases where the kern peak and space peaks merge, | |
| 476 so we will put an UPPER limit on the max certain kern gap of some fraction | |
| 477 below the threshold. | |
| 478 */ | |
| 479 | |
| 480 // upper bound | |
| 481 int32_t max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2); | |
| 482 | |
| 483 // default | |
| 484 row->max_nonspace = max_max_nonspace; | |
| 485 for (int32_t index = 0; index <= max_max_nonspace; index++) { | |
| 486 if (all_gap_stats.pile_count(index) > max) { | |
| 487 max = all_gap_stats.pile_count(index); | |
| 488 } | |
| 489 if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) { | |
| 490 row->max_nonspace = index; | |
| 491 break; | |
| 492 } | |
| 493 } | |
| 494 } | |
| 495 | |
| 496 /* Yet another algorithm - simpler this time - just choose a fraction of the | |
| 497 threshold to space range */ | |
| 498 | |
| 499 if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) { | |
| 500 row->min_space = std::max( | |
| 501 row->min_space, static_cast<int32_t>(ceil(row->space_threshold + | |
| 502 tosp_fuzzy_sp_fraction * | |
| 503 (row->space_size - row->space_threshold)))); | |
| 504 } | |
| 505 | |
| 506 /* Ensure that ANY space less than some multiplier times the kern size is | |
| 507 fuzzy. In tables there is a risk of erroneously setting a small space size | |
| 508 when there are no real spaces. Sometimes tables have text squashed into | |
| 509 columns so that the kn->sp ratio is small anyway - this means that we can't | |
| 510 use this to force a wider separation - hence we rely on context to join any | |
| 511 dubious breaks. */ | |
| 512 | |
| 513 if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) { | |
| 514 row->min_space = std::max( | |
| 515 row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size))); | |
| 516 } | |
| 517 | |
| 518 if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) { | |
| 519 row->max_nonspace = static_cast<int32_t>(floor( | |
| 520 0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size))); | |
| 521 } | |
| 522 if (row->max_nonspace > row->space_threshold) { | |
| 523 // Don't be silly | |
| 524 row->max_nonspace = row->space_threshold; | |
| 525 } | |
| 526 | |
| 527 if (tosp_debug_level > 5) { | |
| 528 tprintf( | |
| 529 "B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) " | |
| 530 "Sp:%3.2f\n", | |
| 531 block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width, | |
| 532 real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold, | |
| 533 row->min_space, row->space_size); | |
| 534 } | |
| 535 if (tosp_debug_level > 10) { | |
| 536 tprintf( | |
| 537 "row->kern_size = %3.2f, row->space_size = %3.2f, " | |
| 538 "row->space_threshold = %d\n", | |
| 539 row->kern_size, row->space_size, row->space_threshold); | |
| 540 } | |
| 541 } | |
| 542 | |
| 543 void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats, | |
| 544 STATS *small_gap_stats, | |
| 545 int16_t block_space_gap_width, // estimate for block | |
| 546 int16_t block_non_space_gap_width // estimate for block | |
| 547 ) { | |
| 548 /* First, estimate row space size */ | |
| 549 /* Old to condition was > 2 */ | |
| 550 if (space_gap_stats->get_total() >= tosp_enough_space_samples_for_median) { | |
| 551 // Adequate samples | |
| 552 /* Set space size to median of spaces BUT limits it if it seems wildly out | |
| 553 */ | |
| 554 row->space_size = space_gap_stats->median(); | |
| 555 if (row->space_size > block_space_gap_width * 1.5) { | |
| 556 if (tosp_old_to_bug_fix) { | |
| 557 row->space_size = block_space_gap_width * 1.5; | |
| 558 } else { | |
| 559 // BUG??? should be *1.5 | |
| 560 row->space_size = block_space_gap_width; | |
| 561 } | |
| 562 } | |
| 563 if (row->space_size < (block_non_space_gap_width * 2) + 1) { | |
| 564 row->space_size = (block_non_space_gap_width * 2) + 1; | |
| 565 } | |
| 566 } | |
| 567 // Only 1 or 2 samples | |
| 568 else if (space_gap_stats->get_total() >= 1) { | |
| 569 // hence mean not median | |
| 570 row->space_size = space_gap_stats->mean(); | |
| 571 if (row->space_size > block_space_gap_width * 1.5) { | |
| 572 if (tosp_old_to_bug_fix) { | |
| 573 row->space_size = block_space_gap_width * 1.5; | |
| 574 } else { | |
| 575 // BUG??? should be *1.5 | |
| 576 row->space_size = block_space_gap_width; | |
| 577 } | |
| 578 } | |
| 579 if (row->space_size < (block_non_space_gap_width * 3) + 1) { | |
| 580 row->space_size = (block_non_space_gap_width * 3) + 1; | |
| 581 } | |
| 582 } else { | |
| 583 // Use block default | |
| 584 row->space_size = block_space_gap_width; | |
| 585 } | |
| 586 | |
| 587 /* Next, estimate row kern size */ | |
| 588 if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) { | |
| 589 row->kern_size = small_gap_stats->median(); | |
| 590 } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) { | |
| 591 row->kern_size = all_gap_stats->median(); | |
| 592 } else { // old TO -SAME FOR ALL ROWS | |
| 593 row->kern_size = block_non_space_gap_width; | |
| 594 } | |
| 595 | |
| 596 /* Finally, estimate row space threshold */ | |
| 597 if (tosp_threshold_bias2 > 0) { | |
| 598 row->space_threshold = int32_t( | |
| 599 floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size))); | |
| 600 } else { | |
| 601 /* | |
| 602 NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold | |
| 603 and holds this in a float. The use is with a >= test | |
| 604 NEW textord uses an integer threshold and a > test | |
| 605 It comes to the same thing. | |
| 606 (Though there is a difference in that old textor has integer space_size | |
| 607 and kern_size.) | |
| 608 */ | |
| 609 row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2)); | |
| 610 } | |
| 611 | |
| 612 // Apply the same logic and ratios as in row_spacing_stats to | |
| 613 // restrict relative values of the row's space_size, kern_size, and | |
| 614 // space_threshold | |
| 615 if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 && | |
| 616 ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) || | |
| 617 ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight))) { | |
| 618 if (row->kern_size > 2.5) { | |
| 619 row->kern_size = row->space_size / tosp_min_sane_kn_sp; | |
| 620 } | |
| 621 row->space_threshold = | |
| 622 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor)); | |
| 623 } | |
| 624 } | |
| 625 | |
| 626 /************************************************************************* | |
| 627 * isolated_row_stats() | |
| 628 * Estimate kern_size, space_threshold and space_size from this row's own gap stats only | |
| 629 *************************************************************************/ | |
| 630 bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats, | |
| 631 bool suspected_table, int16_t block_idx, int16_t row_idx) { | |
| 632 float kern_estimate; | |
| 633 float crude_threshold_estimate; | |
| 634 int16_t small_gaps_count; | |
| 635 int16_t total; | |
| 636 // iterator | |
| 637 BLOBNBOX_IT blob_it = row->blob_list(); | |
| 638 STATS cert_space_gap_stats(0, MAXSPACING - 1); | |
| 639 STATS all_space_gap_stats(0, MAXSPACING - 1); | |
| 640 STATS small_gap_stats(0, MAXSPACING - 1); | |
| 641 TBOX blob_box; | |
| 642 TBOX prev_blob_box; | |
| 643 int16_t gap_width; | |
| 644 int32_t end_of_row; | |
| 645 int32_t row_length; | |
| 646 | |
| 647 kern_estimate = all_gap_stats->median(); | |
| 648 crude_threshold_estimate = | |
| 649 std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight); | |
| 650 small_gaps_count = | |
| 651 stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_threshold_estimate))); | |
| 652 total = all_gap_stats->get_total(); | |
| 653 | |
| 654 if ((total <= tosp_redo_kern_limit) || | |
| 655 ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) || | |
| 656 (total - small_gaps_count < 1)) { | |
| 657 if (tosp_debug_level > 5) { | |
| 658 tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx); | |
| 659 } | |
| 660 return false; | |
| 661 } | |
| 662 blob_it.set_to_list(row->blob_list()); | |
| 663 blob_it.mark_cycle_pt(); | |
| 664 end_of_row = blob_it.data_relative(-1)->bounding_box().right(); | |
| 665 if (tosp_use_pre_chopping) { | |
| 666 blob_box = box_next_pre_chopped(&blob_it); | |
| 667 } else if (tosp_stats_use_xht_gaps) { | |
| 668 blob_box = reduced_box_next(row, &blob_it); | |
| 669 } else { | |
| 670 blob_box = box_next(&blob_it); | |
| 671 } | |
| 672 row_length = end_of_row - blob_box.left(); | |
| 673 prev_blob_box = blob_box; | |
| 674 while (!blob_it.cycled_list()) { | |
| 675 if (tosp_use_pre_chopping) { | |
| 676 blob_box = box_next_pre_chopped(&blob_it); | |
| 677 } else if (tosp_stats_use_xht_gaps) { | |
| 678 blob_box = reduced_box_next(row, &blob_it); | |
| 679 } else { | |
| 680 blob_box = box_next(&blob_it); | |
| 681 } | |
| 682 int16_t left = prev_blob_box.right(); | |
| 683 int16_t right = blob_box.left(); | |
| 684 gap_width = right - left; | |
| 685 if (!ignore_big_gap(row, row_length, gapmap, left, right) && | |
| 686 (gap_width > crude_threshold_estimate)) { | |
| 687 if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) || | |
| 688 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && | |
| 689 (!tosp_narrow_blobs_not_cert || | |
| 690 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) || | |
| 691 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { | |
| 692 cert_space_gap_stats.add(gap_width, 1); | |
| 693 } | |
| 694 all_space_gap_stats.add(gap_width, 1); | |
| 695 } | |
| 696 if (gap_width < crude_threshold_estimate) { | |
| 697 small_gap_stats.add(gap_width, 1); | |
| 698 } | |
| 699 | |
| 700 prev_blob_box = blob_box; | |
| 701 } | |
| 702 if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) { | |
| 703 // median | |
| 704 row->space_size = cert_space_gap_stats.median(); | |
| 705 } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) { | |
| 706 // to avoid spaced | |
| 707 row->space_size = cert_space_gap_stats.mean(); | |
| 708 // 1's in tables | |
| 709 } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) { | |
| 710 // median | |
| 711 row->space_size = all_space_gap_stats.median(); | |
| 712 } else { | |
| 713 row->space_size = all_space_gap_stats.mean(); | |
| 714 } | |
| 715 | |
| 716 if (tosp_only_small_gaps_for_kern) { | |
| 717 row->kern_size = small_gap_stats.median(); | |
| 718 } else { | |
| 719 row->kern_size = all_gap_stats->median(); | |
| 720 } | |
| 721 row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2)); | |
| 722 /* Sanity check */ | |
| 723 if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) || | |
| 724 (row->space_threshold <= 0)) { | |
| 725 if (tosp_debug_level > 5) { | |
| 726 tprintf("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx, | |
| 727 row->kern_size, row->space_threshold, row->space_size); | |
| 728 } | |
| 729 row->kern_size = 0.0f; | |
| 730 row->space_threshold = 0; | |
| 731 row->space_size = 0.0f; | |
| 732 return false; | |
| 733 } | |
| 734 | |
| 735 if (tosp_debug_level > 5) { | |
| 736 tprintf("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size, | |
| 737 row->space_threshold, row->space_size); | |
| 738 } | |
| 739 return true; | |
| 740 } | |
| 741 | |
| 742 int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) { | |
| 743 int16_t index; | |
| 744 int16_t total = 0; | |
| 745 | |
| 746 for (index = 0; index < threshold; index++) { | |
| 747 total += stats->pile_count(index); | |
| 748 } | |
| 749 return total; | |
| 750 } | |
| 751 | |
| 752 /************************************************************************* | |
| 753 * improve_row_threshold() | |
| 754 * Try to recognise a "normal line" - | |
| 755 * > 25 gaps | |
| 756 * && space > 3 * kn && space > 10 | |
| 757 * (I.e. reasonably large space and kn:sp ratio) | |
| 758 * && > 3/4 # gaps < kn + (sp - kn)/3 | |
| 759 * (I.e. most gaps are well away from space estimate) | |
| 760 * && a gap of max(3, (sp - kn) / 3) empty histogram positions is found | |
| 761 * somewhere in the histogram between kn and sp | |
| 762 * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies | |
| 763 * NO!!!!! the bristol line has "11" with a gap of 12 between the | |
| 764 *1's!!! try moving the default threshold to within this band but leave the | |
| 765 * fuzzy limit calculation as at present. | |
| 766 *************************************************************************/ | |
| 767 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) { | |
| 768 float sp = row->space_size; | |
| 769 float kn = row->kern_size; | |
| 770 int16_t reqd_zero_width = 0; | |
| 771 int16_t zero_width = 0; | |
| 772 int16_t zero_start = 0; | |
| 773 int16_t index = 0; | |
| 774 | |
| 775 if (tosp_debug_level > 10) { | |
| 776 tprintf("Improve row threshold 0"); | |
| 777 } | |
| 778 if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) || | |
| 779 (stats_count_under(all_gap_stats, static_cast<int16_t>(ceil(kn + (sp - kn) / 3 + 0.5))) < | |
| 780 (0.75 * all_gap_stats->get_total()))) { | |
| 781 return; | |
| 782 } | |
| 783 if (tosp_debug_level > 10) { | |
| 784 tprintf(" 1"); | |
| 785 } | |
| 786 /* | |
| 787 Look for the first region of all 0's in the histogram which is wider than | |
| 788 max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current | |
| 789 threshold is not within it, move the threshold so that is just inside it. | |
| 790 */ | |
| 791 reqd_zero_width = static_cast<int16_t>(floor((sp - kn) / 3 + 0.5)); | |
| 792 if (reqd_zero_width < 3) { | |
| 793 reqd_zero_width = 3; | |
| 794 } | |
| 795 | |
| 796 for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) { | |
| 797 if (all_gap_stats->pile_count(index) == 0) { | |
| 798 if (zero_width == 0) { | |
| 799 zero_start = index; | |
| 800 } | |
| 801 zero_width++; | |
| 802 } else { | |
| 803 if (zero_width >= reqd_zero_width) { | |
| 804 break; | |
| 805 } else { | |
| 806 zero_width = 0; | |
| 807 } | |
| 808 } | |
| 809 } | |
| 810 index--; | |
| 811 if (tosp_debug_level > 10) { | |
| 812 tprintf(" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n", reqd_zero_width, | |
| 813 zero_width, zero_start, row->space_threshold); | |
| 814 } | |
| 815 if ((zero_width < reqd_zero_width) || | |
| 816 ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) { | |
| 817 return; | |
| 818 } | |
| 819 if (tosp_debug_level > 10) { | |
| 820 tprintf(" 2"); | |
| 821 } | |
| 822 if (row->space_threshold < zero_start) { | |
| 823 if (tosp_debug_level > 5) { | |
| 824 tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start, | |
| 825 index, row->space_threshold, zero_start); | |
| 826 } | |
| 827 row->space_threshold = zero_start; | |
| 828 } | |
| 829 if (row->space_threshold > index) { | |
| 830 if (tosp_debug_level > 5) { | |
| 831 tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start, | |
| 832 index, row->space_threshold, index); | |
| 833 } | |
| 834 row->space_threshold = index; | |
| 835 } | |
| 836 } | |
| 837 | |
| 838 /********************************************************************** | |
| 839 * make_prop_words | |
| 840 * | |
| 841 * Convert a TO_ROW to a ROW. | |
| 842 **********************************************************************/ | |
| 843 ROW *Textord::make_prop_words(TO_ROW *row, // row to make | |
| 844 FCOORD rotation // for drawing | |
| 845 ) { | |
| 846 bool bol; // start of line | |
| 847 /* prev_ values are for start of word being built. non prev_ values are for | |
| 848 the gap between the word being built and the next one. */ | |
| 849 bool prev_fuzzy_sp; // probably space | |
| 850 bool prev_fuzzy_non; // probably not | |
| 851 uint8_t prev_blanks; // in front of word | |
| 852 bool fuzzy_sp = false; // probably space | |
| 853 bool fuzzy_non = false; // probably not | |
| 854 uint8_t blanks = 0; // in front of word | |
| 855 bool prev_gap_was_a_space = false; | |
| 856 bool break_at_next_gap = false; | |
| 857 ROW *real_row; // output row | |
| 858 C_OUTLINE_IT cout_it; | |
| 859 C_BLOB_LIST cblobs; | |
| 860 C_BLOB_IT cblob_it = &cblobs; | |
| 861 WERD_LIST words; | |
| 862 WERD *word; // new word | |
| 863 int32_t next_rep_char_word_right = INT32_MAX; | |
| 864 float repetition_spacing; // gap between repetitions | |
| 865 int32_t xstarts[2]; // row ends | |
| 866 int32_t prev_x; // end of prev blob | |
| 867 BLOBNBOX_IT box_it; // iterator | |
| 868 TBOX prev_blob_box; | |
| 869 TBOX next_blob_box; | |
| 870 int16_t prev_gap = INT16_MAX; | |
| 871 int16_t current_gap = INT16_MAX; | |
| 872 int16_t next_gap = INT16_MAX; | |
| 873 int16_t prev_within_xht_gap = INT16_MAX; | |
| 874 int16_t current_within_xht_gap = INT16_MAX; | |
| 875 int16_t next_within_xht_gap = INT16_MAX; | |
| 876 int16_t word_count = 0; | |
| 877 | |
| 878 // repeated char words | |
| 879 WERD_IT rep_char_it(&(row->rep_words)); | |
| 880 if (!rep_char_it.empty()) { | |
| 881 next_rep_char_word_right = rep_char_it.data()->bounding_box().right(); | |
| 882 } | |
| 883 | |
| 884 prev_x = -INT16_MAX; | |
| 885 cblob_it.set_to_list(&cblobs); | |
| 886 box_it.set_to_list(row->blob_list()); | |
| 887 // new words | |
| 888 WERD_IT word_it(&words); | |
| 889 bol = true; | |
| 890 prev_blanks = 0; | |
| 891 prev_fuzzy_sp = false; | |
| 892 prev_fuzzy_non = false; | |
| 893 if (!box_it.empty()) { | |
| 894 xstarts[0] = box_it.data()->bounding_box().left(); | |
| 895 if (xstarts[0] > next_rep_char_word_right) { | |
| 896 /* We need to insert a repeated char word at the start of the row */ | |
| 897 word = rep_char_it.extract(); | |
| 898 word_it.add_after_then_move(word); | |
| 899 /* Set spaces before repeated char word */ | |
| 900 word->set_flag(W_BOL, true); | |
| 901 bol = false; | |
| 902 word->set_blanks(0); | |
| 903 // NO uncertainty | |
| 904 word->set_flag(W_FUZZY_SP, false); | |
| 905 word->set_flag(W_FUZZY_NON, false); | |
| 906 xstarts[0] = word->bounding_box().left(); | |
| 907 /* Set spaces after repeated char word (and leave current word set) */ | |
| 908 repetition_spacing = find_mean_blob_spacing(word); | |
| 909 current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right; | |
| 910 current_within_xht_gap = current_gap; | |
| 911 if (current_gap > tosp_rep_space * repetition_spacing) { | |
| 912 prev_blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size)); | |
| 913 if (prev_blanks < 1) { | |
| 914 prev_blanks = 1; | |
| 915 } | |
| 916 } else { | |
| 917 prev_blanks = 0; | |
| 918 } | |
| 919 if (tosp_debug_level > 5) { | |
| 920 tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ", | |
| 921 box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(), | |
| 922 repetition_spacing, current_gap); | |
| 923 } | |
| 924 prev_fuzzy_sp = false; | |
| 925 prev_fuzzy_non = false; | |
| 926 if (rep_char_it.empty()) { | |
| 927 next_rep_char_word_right = INT32_MAX; | |
| 928 } else { | |
| 929 rep_char_it.forward(); | |
| 930 next_rep_char_word_right = rep_char_it.data()->bounding_box().right(); | |
| 931 } | |
| 932 } | |
| 933 | |
| 934 peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap); | |
| 935 do { | |
| 936 auto bblob = box_it.data(); | |
| 937 auto blob_box = bblob->bounding_box(); | |
| 938 if (bblob->joined_to_prev()) { | |
| 939 auto cblob = bblob->remove_cblob(); | |
| 940 if (cblob != nullptr) { | |
| 941 cout_it.set_to_list(cblob_it.data()->out_list()); | |
| 942 cout_it.move_to_last(); | |
| 943 cout_it.add_list_after(cblob->out_list()); | |
| 944 delete cblob; | |
| 945 } | |
| 946 } else { | |
| 947 auto cblob = bblob->cblob(); | |
| 948 if (cblob != nullptr) { | |
| 949 bblob->set_owns_cblob(false); | |
| 950 cblob_it.add_after_then_move(cblob); | |
| 951 } | |
| 952 prev_x = blob_box.right(); | |
| 953 } | |
| 954 box_it.forward(); // next one | |
| 955 bblob = box_it.data(); | |
| 956 blob_box = bblob->bounding_box(); | |
| 957 | |
| 958 if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) { | |
| 959 /* Real Blob - not multiple outlines or pre-chopped */ | |
| 960 prev_gap = current_gap; | |
| 961 prev_within_xht_gap = current_within_xht_gap; | |
| 962 prev_blob_box = next_blob_box; | |
| 963 current_gap = next_gap; | |
| 964 current_within_xht_gap = next_within_xht_gap; | |
| 965 peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap); | |
| 966 | |
| 967 int16_t prev_gap_arg = prev_gap; | |
| 968 int16_t next_gap_arg = next_gap; | |
| 969 if (tosp_only_use_xht_gaps) { | |
| 970 prev_gap_arg = prev_within_xht_gap; | |
| 971 next_gap_arg = next_within_xht_gap; | |
| 972 } | |
| 973 // Decide if a word-break should be inserted | |
| 974 if (blob_box.left() > next_rep_char_word_right || | |
| 975 make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap, | |
| 976 current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp, | |
| 977 fuzzy_non, prev_gap_was_a_space, break_at_next_gap) || | |
| 978 box_it.at_first()) { | |
| 979 /* Form a new word out of the blobs collected */ | |
| 980 word = new WERD(&cblobs, prev_blanks, nullptr); | |
| 981 word_count++; | |
| 982 word_it.add_after_then_move(word); | |
| 983 if (bol) { | |
| 984 word->set_flag(W_BOL, true); | |
| 985 bol = false; | |
| 986 } | |
| 987 if (prev_fuzzy_sp) { | |
| 988 // probably space | |
| 989 word->set_flag(W_FUZZY_SP, true); | |
| 990 } else if (prev_fuzzy_non) { | |
| 991 word->set_flag(W_FUZZY_NON, true); | |
| 992 } | |
| 993 // probably not | |
| 994 | |
| 995 if (blob_box.left() > next_rep_char_word_right) { | |
| 996 /* We need to insert a repeated char word */ | |
| 997 word = rep_char_it.extract(); | |
| 998 word_it.add_after_then_move(word); | |
| 999 | |
| 1000 /* Set spaces before repeated char word */ | |
| 1001 repetition_spacing = find_mean_blob_spacing(word); | |
| 1002 current_gap = word->bounding_box().left() - prev_x; | |
| 1003 current_within_xht_gap = current_gap; | |
| 1004 if (current_gap > tosp_rep_space * repetition_spacing) { | |
| 1005 blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size)); | |
| 1006 if (blanks < 1) { | |
| 1007 blanks = 1; | |
| 1008 } | |
| 1009 } else { | |
| 1010 blanks = 0; | |
| 1011 } | |
| 1012 if (tosp_debug_level > 5) { | |
| 1013 tprintf("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);", | |
| 1014 word->bounding_box().left(), word->bounding_box().bottom(), | |
| 1015 repetition_spacing, current_gap, blanks); | |
| 1016 } | |
| 1017 word->set_blanks(blanks); | |
| 1018 // NO uncertainty | |
| 1019 word->set_flag(W_FUZZY_SP, false); | |
| 1020 word->set_flag(W_FUZZY_NON, false); | |
| 1021 | |
| 1022 /* Set spaces after repeated char word (and leave current word set) | |
| 1023 */ | |
| 1024 current_gap = blob_box.left() - next_rep_char_word_right; | |
| 1025 if (current_gap > tosp_rep_space * repetition_spacing) { | |
| 1026 blanks = static_cast<uint8_t>(current_gap / row->space_size); | |
| 1027 if (blanks < 1) { | |
| 1028 blanks = 1; | |
| 1029 } | |
| 1030 } else { | |
| 1031 blanks = 0; | |
| 1032 } | |
| 1033 if (tosp_debug_level > 5) { | |
| 1034 tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks); | |
| 1035 } | |
| 1036 fuzzy_sp = false; | |
| 1037 fuzzy_non = false; | |
| 1038 | |
| 1039 if (rep_char_it.empty()) { | |
| 1040 next_rep_char_word_right = INT32_MAX; | |
| 1041 } else { | |
| 1042 rep_char_it.forward(); | |
| 1043 next_rep_char_word_right = rep_char_it.data()->bounding_box().right(); | |
| 1044 } | |
| 1045 } | |
| 1046 | |
| 1047 if (box_it.at_first() && rep_char_it.empty()) { | |
| 1048 // at end of line | |
| 1049 word->set_flag(W_EOL, true); | |
| 1050 xstarts[1] = prev_x; | |
| 1051 } else { | |
| 1052 prev_blanks = blanks; | |
| 1053 prev_fuzzy_sp = fuzzy_sp; | |
| 1054 prev_fuzzy_non = fuzzy_non; | |
| 1055 } | |
| 1056 } | |
| 1057 } | |
| 1058 } while (!box_it.at_first()); // until back at start | |
| 1059 | |
| 1060 /* Insert any further repeated char words */ | |
| 1061 while (!rep_char_it.empty()) { | |
| 1062 word = rep_char_it.extract(); | |
| 1063 word_it.add_after_then_move(word); | |
| 1064 | |
| 1065 /* Set spaces before repeated char word */ | |
| 1066 repetition_spacing = find_mean_blob_spacing(word); | |
| 1067 current_gap = word->bounding_box().left() - prev_x; | |
| 1068 if (current_gap > tosp_rep_space * repetition_spacing) { | |
| 1069 blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size)); | |
| 1070 if (blanks < 1) { | |
| 1071 blanks = 1; | |
| 1072 } | |
| 1073 } else { | |
| 1074 blanks = 0; | |
| 1075 } | |
| 1076 if (tosp_debug_level > 5) { | |
| 1077 tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n", | |
| 1078 word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing, | |
| 1079 current_gap, blanks); | |
| 1080 } | |
| 1081 word->set_blanks(blanks); | |
| 1082 // NO uncertainty | |
| 1083 word->set_flag(W_FUZZY_SP, false); | |
| 1084 word->set_flag(W_FUZZY_NON, false); | |
| 1085 prev_x = word->bounding_box().right(); | |
| 1086 if (rep_char_it.empty()) { | |
| 1087 // at end of line | |
| 1088 word->set_flag(W_EOL, true); | |
| 1089 xstarts[1] = prev_x; | |
| 1090 } else { | |
| 1091 rep_char_it.forward(); | |
| 1092 } | |
| 1093 } | |
| 1094 real_row = | |
| 1095 new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size)); | |
| 1096 word_it.set_to_list(real_row->word_list()); | |
| 1097 // put words in row | |
| 1098 word_it.add_list_after(&words); | |
| 1099 real_row->recalc_bounding_box(); | |
| 1100 | |
| 1101 if (tosp_debug_level > 4) { | |
| 1102 tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count, | |
| 1103 real_row->bounding_box().left(), real_row->bounding_box().bottom(), | |
| 1104 real_row->bounding_box().right(), real_row->bounding_box().top()); | |
| 1105 } | |
| 1106 return real_row; | |
| 1107 } | |
| 1108 return nullptr; | |
| 1109 } | |
| 1110 | |
| 1111 /********************************************************************** | |
| 1112 * make_blob_words | |
| 1113 * | |
| 1114 * Converts words into blobs so that each blob is a single character. | |
| 1115 * Used for chopper test. | |
| 1116 **********************************************************************/ | |
| 1117 ROW *Textord::make_blob_words(TO_ROW *row, // row to make | |
| 1118 FCOORD rotation // for drawing | |
| 1119 ) { | |
| 1120 bool bol; // start of line | |
| 1121 ROW *real_row; // output row | |
| 1122 C_OUTLINE_IT cout_it; | |
| 1123 C_BLOB_LIST cblobs; | |
| 1124 C_BLOB_IT cblob_it = &cblobs; | |
| 1125 WERD_LIST words; | |
| 1126 WERD *word; // new word | |
| 1127 BLOBNBOX_IT box_it; // iterator | |
| 1128 int16_t word_count = 0; | |
| 1129 | |
| 1130 cblob_it.set_to_list(&cblobs); | |
| 1131 box_it.set_to_list(row->blob_list()); | |
| 1132 // new words | |
| 1133 WERD_IT word_it(&words); | |
| 1134 bol = true; | |
| 1135 if (!box_it.empty()) { | |
| 1136 do { | |
| 1137 auto bblob = box_it.data(); | |
| 1138 auto blob_box = bblob->bounding_box(); | |
| 1139 if (bblob->joined_to_prev()) { | |
| 1140 auto cblob = bblob->remove_cblob(); | |
| 1141 if (cblob != nullptr) { | |
| 1142 cout_it.set_to_list(cblob_it.data()->out_list()); | |
| 1143 cout_it.move_to_last(); | |
| 1144 cout_it.add_list_after(cblob->out_list()); | |
| 1145 delete cblob; | |
| 1146 } | |
| 1147 } else { | |
| 1148 auto cblob = bblob->cblob(); | |
| 1149 if (cblob != nullptr) { | |
| 1150 bblob->set_owns_cblob(false); | |
| 1151 cblob_it.add_after_then_move(cblob); | |
| 1152 } | |
| 1153 } | |
| 1154 box_it.forward(); // next one | |
| 1155 bblob = box_it.data(); | |
| 1156 blob_box = bblob->bounding_box(); | |
| 1157 | |
| 1158 if (!bblob->joined_to_prev() && !cblobs.empty()) { | |
| 1159 word = new WERD(&cblobs, 1, nullptr); | |
| 1160 word_count++; | |
| 1161 word_it.add_after_then_move(word); | |
| 1162 if (bol) { | |
| 1163 word->set_flag(W_BOL, true); | |
| 1164 bol = false; | |
| 1165 } | |
| 1166 if (box_it.at_first()) { // at end of line | |
| 1167 word->set_flag(W_EOL, true); | |
| 1168 } | |
| 1169 } | |
| 1170 } while (!box_it.at_first()); // until back at start | |
| 1171 /* Setup the row with created words. */ | |
| 1172 real_row = | |
| 1173 new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size)); | |
| 1174 word_it.set_to_list(real_row->word_list()); | |
| 1175 // put words in row | |
| 1176 word_it.add_list_after(&words); | |
| 1177 real_row->recalc_bounding_box(); | |
| 1178 if (tosp_debug_level > 4) { | |
| 1179 tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count, | |
| 1180 real_row->bounding_box().left(), real_row->bounding_box().bottom(), | |
| 1181 real_row->bounding_box().right(), real_row->bounding_box().top()); | |
| 1182 } | |
| 1183 return real_row; | |
| 1184 } | |
| 1185 return nullptr; | |
| 1186 } | |
| 1187 | |
| 1188 bool Textord::make_a_word_break(TO_ROW *row, // row being made | |
| 1189 TBOX blob_box, // for next_blob // how many blanks? | |
| 1190 int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap, | |
| 1191 int16_t within_xht_current_gap, TBOX next_blob_box, | |
| 1192 int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non, | |
| 1193 bool &prev_gap_was_a_space, bool &break_at_next_gap) { | |
| 1194 bool space; | |
| 1195 int16_t current_gap; | |
| 1196 float fuzzy_sp_to_kn_limit; | |
| 1197 | |
| 1198 if (break_at_next_gap) { | |
| 1199 break_at_next_gap = false; | |
| 1200 return true; | |
| 1201 } | |
| 1202 /* Inhibit using the reduced gap if | |
| 1203 The kerning is large - chars are not kerned and reducing "f"s can cause | |
| 1204 erroneous blanks | |
| 1205 OR The real gap is less than 0 | |
| 1206 OR The real gap is less than the kerning estimate | |
| 1207 */ | |
| 1208 if ((row->kern_size > tosp_large_kerning * row->xheight) || | |
| 1209 ((tosp_dont_fool_with_small_kerns >= 0) && | |
| 1210 (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) { | |
| 1211 // Ignore the difference | |
| 1212 within_xht_current_gap = real_current_gap; | |
| 1213 } | |
| 1214 | |
| 1215 if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) { | |
| 1216 current_gap = within_xht_current_gap; | |
| 1217 } else { | |
| 1218 current_gap = real_current_gap; | |
| 1219 } | |
| 1220 | |
| 1221 if (tosp_old_to_method) { | |
| 1222 // Boring old method | |
| 1223 space = current_gap > row->max_nonspace; | |
| 1224 if (space && (current_gap < INT16_MAX)) { | |
| 1225 if (current_gap < row->min_space) { | |
| 1226 if (current_gap > row->space_threshold) { | |
| 1227 blanks = 1; | |
| 1228 fuzzy_sp = true; | |
| 1229 fuzzy_non = false; | |
| 1230 } else { | |
| 1231 blanks = 0; | |
| 1232 fuzzy_sp = false; | |
| 1233 fuzzy_non = true; | |
| 1234 } | |
| 1235 } else { | |
| 1236 if (row->space_size == 0.0f) { | |
| 1237 // Avoid FP division by 0. | |
| 1238 blanks = 1; | |
| 1239 } else { | |
| 1240 blanks = static_cast<uint8_t>(current_gap / row->space_size); | |
| 1241 if (blanks < 1) { | |
| 1242 blanks = 1; | |
| 1243 } | |
| 1244 } | |
| 1245 fuzzy_sp = false; | |
| 1246 fuzzy_non = false; | |
| 1247 } | |
| 1248 } | |
| 1249 return space; | |
| 1250 } else { | |
| 1251 /* New exciting heuristic method */ | |
| 1252 if (prev_blob_box.null_box()) { // Beginning of row | |
| 1253 prev_gap_was_a_space = true; | |
| 1254 } | |
| 1255 | |
| 1256 // Default as old TO | |
| 1257 space = current_gap > row->space_threshold; | |
| 1258 | |
| 1259 /* Set defaults for the word break in case we find one. Currently there are | |
| 1260 no fuzzy spaces. Depending on the reliability of the different heuristics | |
| 1261 we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY | |
| 1262 be used if the function returns true - ie the word is to be broken. | |
| 1263 */ | |
| 1264 int num_blanks = current_gap; | |
| 1265 if (row->space_size > 1.0f) { | |
| 1266 num_blanks = IntCastRounded(current_gap / row->space_size); | |
| 1267 } | |
| 1268 blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX)); | |
| 1269 fuzzy_sp = false; | |
| 1270 fuzzy_non = false; | |
| 1271 /* | |
| 1272 If xht measure causes gap to flip one of the 3 thresholds act accordingly - | |
| 1273 despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to | |
| 1274 context. | |
| 1275 */ | |
| 1276 if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) && | |
| 1277 (within_xht_current_gap > row->max_nonspace)) { | |
| 1278 space = true; | |
| 1279 fuzzy_non = true; | |
| 1280 #ifndef GRAPHICS_DISABLED | |
| 1281 mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), | |
| 1282 next_gap); | |
| 1283 #endif | |
| 1284 } else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) && | |
| 1285 (within_xht_current_gap > row->space_threshold)) { | |
| 1286 space = true; | |
| 1287 if (tosp_flip_fuzz_kn_to_sp) { | |
| 1288 fuzzy_sp = true; | |
| 1289 } else { | |
| 1290 fuzzy_non = true; | |
| 1291 } | |
| 1292 #ifndef GRAPHICS_DISABLED | |
| 1293 mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), | |
| 1294 next_gap); | |
| 1295 #endif | |
| 1296 } else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) && | |
| 1297 (within_xht_current_gap >= row->min_space)) { | |
| 1298 space = true; | |
| 1299 #ifndef GRAPHICS_DISABLED | |
| 1300 mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), | |
| 1301 next_gap); | |
| 1302 #endif | |
| 1303 } else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) && | |
| 1304 suspected_punct_blob(row, blob_box)) { | |
| 1305 break_at_next_gap = true; | |
| 1306 } | |
| 1307 /* Now continue with normal heuristics */ | |
| 1308 else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) { | |
| 1309 /* Heuristics to turn dubious spaces to kerns */ | |
| 1310 if (tosp_pass_wide_fuzz_sp_to_context > 0) { | |
| 1311 fuzzy_sp_to_kn_limit = | |
| 1312 row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size); | |
| 1313 } else { | |
| 1314 fuzzy_sp_to_kn_limit = 99999.0f; | |
| 1315 } | |
| 1316 | |
| 1317 /* If current gap is significantly smaller than the previous space the | |
| 1318 other side of a narrow blob then this gap is a kern. */ | |
| 1319 if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space && | |
| 1320 (current_gap <= tosp_gap_factor * prev_gap)) { | |
| 1321 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { | |
| 1322 if (tosp_flip_fuzz_sp_to_kn) { | |
| 1323 fuzzy_non = true; | |
| 1324 } else { | |
| 1325 fuzzy_sp = true; | |
| 1326 } | |
| 1327 } else { | |
| 1328 space = false; | |
| 1329 } | |
| 1330 #ifndef GRAPHICS_DISABLED | |
| 1331 mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), | |
| 1332 next_gap); | |
| 1333 #endif | |
| 1334 } | |
| 1335 /* If current gap not much bigger than the previous kern the other side of | |
| 1336 a narrow blob then this gap is a kern as well */ | |
| 1337 else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && | |
| 1338 !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) { | |
| 1339 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { | |
| 1340 if (tosp_flip_fuzz_sp_to_kn) { | |
| 1341 fuzzy_non = true; | |
| 1342 } else { | |
| 1343 fuzzy_sp = true; | |
| 1344 } | |
| 1345 } else { | |
| 1346 space = false; | |
| 1347 } | |
| 1348 #ifndef GRAPHICS_DISABLED | |
| 1349 mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), | |
| 1350 next_gap); | |
| 1351 #endif | |
| 1352 } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) && | |
| 1353 (next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) { | |
| 1354 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { | |
| 1355 if (tosp_flip_fuzz_sp_to_kn) { | |
| 1356 fuzzy_non = true; | |
| 1357 } else { | |
| 1358 fuzzy_sp = true; | |
| 1359 } | |
| 1360 } else { | |
| 1361 space = false; | |
| 1362 } | |
| 1363 #ifndef GRAPHICS_DISABLED | |
| 1364 mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), | |
| 1365 next_gap); | |
| 1366 #endif | |
| 1367 } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) && | |
| 1368 (next_gap <= row->space_threshold) && | |
| 1369 (current_gap * tosp_gap_factor <= next_gap)) { | |
| 1370 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { | |
| 1371 if (tosp_flip_fuzz_sp_to_kn) { | |
| 1372 fuzzy_non = true; | |
| 1373 } else { | |
| 1374 fuzzy_sp = true; | |
| 1375 } | |
| 1376 } else { | |
| 1377 space = false; | |
| 1378 } | |
| 1379 #ifndef GRAPHICS_DISABLED | |
| 1380 mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), | |
| 1381 next_gap); | |
| 1382 #endif | |
| 1383 } else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) || | |
| 1384 ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) { | |
| 1385 fuzzy_sp = true; | |
| 1386 #ifndef GRAPHICS_DISABLED | |
| 1387 mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), | |
| 1388 next_gap); | |
| 1389 #endif | |
| 1390 } | |
| 1391 } else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) { | |
| 1392 /* Heuristics to turn dubious kerns to spaces */ | |
| 1393 /* TRIED THIS BUT IT MADE THINGS WORSE | |
| 1394 if (prev_gap == INT16_MAX) | |
| 1395 prev_gap = 0; // start of row | |
| 1396 if (next_gap == INT16_MAX) | |
| 1397 next_gap = 0; // end of row | |
| 1398 */ | |
| 1399 if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) && | |
| 1400 (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) && | |
| 1401 wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) { | |
| 1402 space = true; | |
| 1403 /* | |
| 1404 tosp_flip_caution is an attempt to stop the default changing in cases | |
| 1405 where there is a large difference between the kern and space estimates. | |
| 1406 See problem in 'chiefs' where "have" gets split in the quotation. | |
| 1407 */ | |
| 1408 if ((tosp_flip_fuzz_kn_to_sp) && | |
| 1409 ((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) { | |
| 1410 fuzzy_sp = true; | |
| 1411 } else { | |
| 1412 fuzzy_non = true; | |
| 1413 } | |
| 1414 #ifndef GRAPHICS_DISABLED | |
| 1415 mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), | |
| 1416 next_gap); | |
| 1417 #endif | |
| 1418 } else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 && | |
| 1419 current_gap > 5 && // Rule 9 handles small gap, big ratio. | |
| 1420 current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) && | |
| 1421 !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) && | |
| 1422 !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) { | |
| 1423 space = true; | |
| 1424 fuzzy_non = true; | |
| 1425 #ifndef GRAPHICS_DISABLED | |
| 1426 mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), | |
| 1427 next_gap); | |
| 1428 #endif | |
| 1429 } else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) && | |
| 1430 (next_blob_box.width() > 0) && | |
| 1431 (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) && | |
| 1432 (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) && | |
| 1433 !suspected_punct_blob(row, next_blob_box)))) { | |
| 1434 space = true; | |
| 1435 fuzzy_non = true; | |
| 1436 #ifndef GRAPHICS_DISABLED | |
| 1437 mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), | |
| 1438 next_gap); | |
| 1439 #endif | |
| 1440 } | |
| 1441 } | |
| 1442 if (tosp_debug_level > 10) { | |
| 1443 tprintf( | |
| 1444 "word break = %d current_gap = %d, prev_gap = %d, " | |
| 1445 "next_gap = %d\n", | |
| 1446 space ? 1 : 0, current_gap, prev_gap, next_gap); | |
| 1447 } | |
| 1448 prev_gap_was_a_space = space && !(fuzzy_non); | |
| 1449 return space; | |
| 1450 } | |
| 1451 } | |
| 1452 | |
| 1453 bool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) { | |
| 1454 bool result; | |
| 1455 result = | |
| 1456 ((blob_box.width() <= tosp_narrow_fraction * row->xheight) || | |
| 1457 ((static_cast<float>(blob_box.width()) / blob_box.height()) <= tosp_narrow_aspect_ratio)); | |
| 1458 return result; | |
| 1459 } | |
| 1460 | |
| 1461 bool Textord::wide_blob(TO_ROW *row, TBOX blob_box) { | |
| 1462 bool result; | |
| 1463 if (tosp_wide_fraction > 0) { | |
| 1464 if (tosp_wide_aspect_ratio > 0) { | |
| 1465 result = | |
| 1466 ((blob_box.width() >= tosp_wide_fraction * row->xheight) && | |
| 1467 ((static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio)); | |
| 1468 } else { | |
| 1469 result = (blob_box.width() >= tosp_wide_fraction * row->xheight); | |
| 1470 } | |
| 1471 } else { | |
| 1472 result = !narrow_blob(row, blob_box); | |
| 1473 } | |
| 1474 return result; | |
| 1475 } | |
| 1476 | |
| 1477 bool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) { | |
| 1478 bool result; | |
| 1479 float baseline; | |
| 1480 float blob_x_centre; | |
| 1481 /* Find baseline of centre of blob */ | |
| 1482 blob_x_centre = (box.right() + box.left()) / 2.0; | |
| 1483 baseline = row->baseline.y(blob_x_centre); | |
| 1484 | |
| 1485 result = (box.height() <= 0.66 * row->xheight) || (box.top() < baseline + row->xheight / 2.0) || | |
| 1486 (box.bottom() > baseline + row->xheight / 2.0); | |
| 1487 return result; | |
| 1488 } | |
| 1489 | |
| 1490 void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box, | |
| 1491 int16_t &next_gap, int16_t &next_within_xht_gap) { | |
| 1492 TBOX next_reduced_blob_box; | |
| 1493 TBOX bit_beyond; | |
| 1494 BLOBNBOX_IT reduced_box_it = box_it; | |
| 1495 | |
| 1496 next_blob_box = box_next(&box_it); | |
| 1497 next_reduced_blob_box = reduced_box_next(row, &reduced_box_it); | |
| 1498 if (box_it.at_first()) { | |
| 1499 next_gap = INT16_MAX; | |
| 1500 next_within_xht_gap = INT16_MAX; | |
| 1501 } else { | |
| 1502 bit_beyond = box_it.data()->bounding_box(); | |
| 1503 next_gap = bit_beyond.left() - next_blob_box.right(); | |
| 1504 bit_beyond = reduced_box_next(row, &reduced_box_it); | |
| 1505 next_within_xht_gap = bit_beyond.left() - next_reduced_blob_box.right(); | |
| 1506 } | |
| 1507 } | |
| 1508 | |
| 1509 #ifndef GRAPHICS_DISABLED | |
| 1510 void Textord::mark_gap(TBOX blob, // blob following gap | |
| 1511 int16_t rule, // heuristic id | |
| 1512 int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap, | |
| 1513 int16_t next_blob_width, int16_t next_gap) { | |
| 1514 ScrollView::Color col; // of ellipse marking flipped gap | |
| 1515 | |
| 1516 switch (rule) { | |
| 1517 case 1: | |
| 1518 col = ScrollView::RED; | |
| 1519 break; | |
| 1520 case 2: | |
| 1521 col = ScrollView::CYAN; | |
| 1522 break; | |
| 1523 case 3: | |
| 1524 col = ScrollView::GREEN; | |
| 1525 break; | |
| 1526 case 4: | |
| 1527 col = ScrollView::BLACK; | |
| 1528 break; | |
| 1529 case 5: | |
| 1530 col = ScrollView::MAGENTA; | |
| 1531 break; | |
| 1532 case 6: | |
| 1533 col = ScrollView::BLUE; | |
| 1534 break; | |
| 1535 | |
| 1536 case 7: | |
| 1537 col = ScrollView::WHITE; | |
| 1538 break; | |
| 1539 case 8: | |
| 1540 col = ScrollView::YELLOW; | |
| 1541 break; | |
| 1542 case 9: | |
| 1543 col = ScrollView::BLACK; | |
| 1544 break; | |
| 1545 | |
| 1546 case 20: | |
| 1547 col = ScrollView::CYAN; | |
| 1548 break; | |
| 1549 case 21: | |
| 1550 col = ScrollView::GREEN; | |
| 1551 break; | |
| 1552 case 22: | |
| 1553 col = ScrollView::MAGENTA; | |
| 1554 break; | |
| 1555 default: | |
| 1556 col = ScrollView::BLACK; | |
| 1557 } | |
| 1558 if (textord_show_initial_words) { | |
| 1559 to_win->Pen(col); | |
| 1560 /* if (rule < 20) | |
| 1561 //interior_style(to_win, INT_SOLID, false); | |
| 1562 else | |
| 1563 //interior_style(to_win, INT_HOLLOW, true);*/ | |
| 1564 // x radius | |
| 1565 to_win->Ellipse(current_gap / 2.0f, | |
| 1566 blob.height() / 2.0f, // y radius | |
| 1567 // x centre | |
| 1568 blob.left() - current_gap / 2.0f, | |
| 1569 // y centre | |
| 1570 blob.bottom() + blob.height() / 2.0f); | |
| 1571 } | |
| 1572 if (tosp_debug_level > 5) { | |
| 1573 tprintf(" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2, | |
| 1574 blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap); | |
| 1575 } | |
| 1576 } | |
| 1577 #endif | |
| 1578 | |
| 1579 float Textord::find_mean_blob_spacing(WERD *word) { | |
| 1580 C_BLOB_IT cblob_it; | |
| 1581 TBOX blob_box; | |
| 1582 int32_t gap_sum = 0; | |
| 1583 int16_t gap_count = 0; | |
| 1584 int16_t prev_right; | |
| 1585 | |
| 1586 cblob_it.set_to_list(word->cblob_list()); | |
| 1587 if (!cblob_it.empty()) { | |
| 1588 cblob_it.mark_cycle_pt(); | |
| 1589 prev_right = cblob_it.data()->bounding_box().right(); | |
| 1590 // first blob | |
| 1591 cblob_it.forward(); | |
| 1592 for (; !cblob_it.cycled_list(); cblob_it.forward()) { | |
| 1593 blob_box = cblob_it.data()->bounding_box(); | |
| 1594 gap_sum += blob_box.left() - prev_right; | |
| 1595 gap_count++; | |
| 1596 prev_right = blob_box.right(); | |
| 1597 } | |
| 1598 } | |
| 1599 if (gap_count > 0) { | |
| 1600 return (gap_sum / static_cast<float>(gap_count)); | |
| 1601 } else { | |
| 1602 return 0.0f; | |
| 1603 } | |
| 1604 } | |
| 1605 | |
| 1606 bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left, | |
| 1607 int16_t right) { | |
| 1608 int16_t gap = right - left + 1; | |
| 1609 | |
| 1610 if (tosp_ignore_big_gaps > 999) { | |
| 1611 return false; // Don't ignore | |
| 1612 } | |
| 1613 if (tosp_ignore_big_gaps > 0) { | |
| 1614 return (gap > tosp_ignore_big_gaps * row->xheight); | |
| 1615 } | |
| 1616 if (gap > tosp_ignore_very_big_gaps * row->xheight) { | |
| 1617 return true; | |
| 1618 } | |
| 1619 if (tosp_ignore_big_gaps == 0) { | |
| 1620 if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) { | |
| 1621 return true; | |
| 1622 } | |
| 1623 if ((gap > 1.75 * row->xheight) && | |
| 1624 ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right))) { | |
| 1625 return true; | |
| 1626 } | |
| 1627 } else { | |
| 1628 /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table | |
| 1629 */ | |
| 1630 if ((gap > gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right)) { | |
| 1631 return true; | |
| 1632 } | |
| 1633 } | |
| 1634 return false; | |
| 1635 } | |
| 1636 | |
| 1637 /********************************************************************** | |
| 1638 * reduced_box_next | |
| 1639 * | |
| 1640 * Compute the bounding box of this blob with merging of x overlaps | |
| 1641 * but no pre-chopping. | |
| 1642 * Then move the iterator on to the start of the next blob. | |
| 1643 * DON'T reduce the box for small things - eg punctuation. | |
| 1644 **********************************************************************/ | |
| 1645 TBOX Textord::reduced_box_next(TO_ROW *row, // current row | |
| 1646 BLOBNBOX_IT *it // iterator to blobds | |
| 1647 ) { | |
| 1648 BLOBNBOX *blob; // current blob | |
| 1649 BLOBNBOX *head_blob; // place to store box | |
| 1650 TBOX full_box; // full blob boundg box | |
| 1651 TBOX reduced_box; // box of significant part | |
| 1652 int16_t left_above_xht; // ABOVE xht left limit | |
| 1653 int16_t new_left_above_xht; // ABOVE xht left limit | |
| 1654 | |
| 1655 blob = it->data(); | |
| 1656 if (blob->red_box_set()) { | |
| 1657 reduced_box = blob->reduced_box(); | |
| 1658 do { | |
| 1659 it->forward(); | |
| 1660 blob = it->data(); | |
| 1661 } while (blob->cblob() == nullptr || blob->joined_to_prev()); | |
| 1662 return reduced_box; | |
| 1663 } | |
| 1664 head_blob = blob; | |
| 1665 full_box = blob->bounding_box(); | |
| 1666 reduced_box = reduced_box_for_blob(blob, row, &left_above_xht); | |
| 1667 do { | |
| 1668 it->forward(); | |
| 1669 blob = it->data(); | |
| 1670 if (blob->cblob() == nullptr) { | |
| 1671 // was pre-chopped | |
| 1672 full_box += blob->bounding_box(); | |
| 1673 } else if (blob->joined_to_prev()) { | |
| 1674 reduced_box += reduced_box_for_blob(blob, row, &new_left_above_xht); | |
| 1675 left_above_xht = std::min(left_above_xht, new_left_above_xht); | |
| 1676 } | |
| 1677 } | |
| 1678 // until next real blob | |
| 1679 while (blob->cblob() == nullptr || blob->joined_to_prev()); | |
| 1680 | |
| 1681 if ((reduced_box.width() > 0) && | |
| 1682 ((reduced_box.left() + tosp_near_lh_edge * reduced_box.width()) < left_above_xht) && | |
| 1683 (reduced_box.height() > 0.7 * row->xheight)) { | |
| 1684 #ifndef GRAPHICS_DISABLED | |
| 1685 if (textord_show_initial_words) { | |
| 1686 reduced_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW); | |
| 1687 } | |
| 1688 #endif | |
| 1689 } else { | |
| 1690 reduced_box = full_box; | |
| 1691 } | |
| 1692 head_blob->set_reduced_box(reduced_box); | |
| 1693 return reduced_box; | |
| 1694 } | |
| 1695 | |
| 1696 /************************************************************************* | |
| 1697 * reduced_box_for_blob() | |
| 1698 * Find box for blob which is the same height and y position as the whole blob, | |
| 1699 * but whose left limit is the left most position of the blob ABOVE the | |
| 1700 * baseline and whose right limit is the right most position of the blob BELOW | |
| 1701 * the xheight. | |
| 1702 * | |
| 1703 * | |
| 1704 * !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on | |
| 1705 * "home". Perhaps we need something which say if the width ABOVE the | |
| 1706 * xht alone includes the whole of the reduced width, then use the full | |
| 1707 * blob box - Might still fail on italic F | |
| 1708 * | |
| 1709 * Alternatively we could be a little less severe and only reduce the | |
| 1710 * left and right edges by half the difference between the full box and | |
| 1711 * the reduced box. | |
| 1712 * | |
| 1713 * NOTE that we need to rotate all the coordinates as | |
| 1714 * find_blob_limits finds the y min and max within a specified x band | |
| 1715 *************************************************************************/ | |
| 1716 TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) { | |
| 1717 float baseline; | |
| 1718 float blob_x_centre; | |
| 1719 float left_limit; | |
| 1720 float right_limit; | |
| 1721 float junk; | |
| 1722 TBOX blob_box; | |
| 1723 | |
| 1724 /* Find baseline of centre of blob */ | |
| 1725 | |
| 1726 blob_box = blob->bounding_box(); | |
| 1727 blob_x_centre = (blob_box.left() + blob_box.right()) / 2.0; | |
| 1728 baseline = row->baseline.y(blob_x_centre); | |
| 1729 | |
| 1730 /* | |
| 1731 Find LH limit of blob ABOVE the xht. This is so that we can detect certain | |
| 1732 caps ht chars which should NOT have their box reduced: T, Y, V, W etc | |
| 1733 */ | |
| 1734 left_limit = static_cast<float>(INT32_MAX); | |
| 1735 junk = static_cast<float>(-INT32_MAX); | |
| 1736 find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), static_cast<float>(INT16_MAX), | |
| 1737 left_limit, junk); | |
| 1738 if (left_limit > junk) { | |
| 1739 *left_above_xht = INT16_MAX; // No area above xht | |
| 1740 } else { | |
| 1741 *left_above_xht = static_cast<int16_t>(std::floor(left_limit)); | |
| 1742 } | |
| 1743 /* | |
| 1744 Find reduced LH limit of blob - the left extent of the region ABOVE the | |
| 1745 baseline. | |
| 1746 */ | |
| 1747 left_limit = static_cast<float>(INT32_MAX); | |
| 1748 junk = static_cast<float>(-INT32_MAX); | |
| 1749 find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX), left_limit, junk); | |
| 1750 | |
| 1751 if (left_limit > junk) { | |
| 1752 return TBOX(); // no area within xht so return empty box | |
| 1753 } | |
| 1754 /* | |
| 1755 Find reduced RH limit of blob - the right extent of the region BELOW the xht. | |
| 1756 */ | |
| 1757 junk = static_cast<float>(INT32_MAX); | |
| 1758 right_limit = static_cast<float>(-INT32_MAX); | |
| 1759 find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX), (baseline + row->xheight), junk, | |
| 1760 right_limit); | |
| 1761 if (junk > right_limit) { | |
| 1762 return TBOX(); // no area within xht so return empty box | |
| 1763 } | |
| 1764 | |
| 1765 return TBOX(ICOORD(static_cast<int16_t>(std::floor(left_limit)), blob_box.bottom()), | |
| 1766 ICOORD(static_cast<int16_t>(std::ceil(right_limit)), blob_box.top())); | |
| 1767 } | |
| 1768 } // namespace tesseract |
