comparison mupdf-source/thirdparty/tesseract/src/textord/tospace.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 /**********************************************************************
11 * tospace.cpp
12 *
13 * Compute fuzzy word spacing thresholds for each row.
14 * I.e. set : max_nonspace
15 * space_threshold
16 * min_space
17 * kern_size
18 * space_size
19 * for each row.
20 * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
21 *
22 * Note: functions in this file were originally not members of any
23 * class or enclosed by any namespace. Now they are all static members
24 * of the Textord class.
25 *
26 **********************************************************************/
27
28 #include "drawtord.h"
29 #include "statistc.h"
30 #include "textord.h"
31 #include "tovars.h"
32
33 // Include automatically generated configuration file if running autoconf.
34 #ifdef HAVE_CONFIG_H
35 # include "config_auto.h"
36 #endif
37
38 #include <algorithm>
39 #include <cmath>
40 #include <memory>
41
42 #define MAXSPACING 128 /*max expected spacing in pix */
43
44 namespace tesseract {
45 void Textord::to_spacing(ICOORD page_tr, // topright of page
46 TO_BLOCK_LIST *blocks // blocks on page
47 ) {
48 TO_BLOCK_IT block_it; // iterator
49 TO_BLOCK *block; // current block;
50 TO_ROW *row; // current row
51 int block_index; // block number
52 int row_index; // row number
53 // estimated width of real spaces for whole block
54 int16_t block_space_gap_width;
55 // estimated width of non space gaps for whole block
56 int16_t block_non_space_gap_width;
57 bool old_text_ord_proportional; // old fixed/prop result
58
59 block_it.set_to_list(blocks);
60 block_index = 1;
61 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
62 block = block_it.data();
63 std::unique_ptr<GAPMAP> gapmap(new GAPMAP(block)); // map of big vert gaps in blk
64 block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width,
65 block_non_space_gap_width);
66 // Make sure relative values of block-level space and non-space gap
67 // widths are reasonable. The ratio of 1:3 is also used in
68 // block_spacing_stats, to correct the block_space_gap_width.
69 // Useful for arabic and hindi, when the non-space gap width is
70 // often over-estimated and should not be trusted. A similar ratio
71 // is found in block_spacing_stats.
72 if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&
73 block_non_space_gap_width > block_space_gap_width / 3) {
74 block_non_space_gap_width = block_space_gap_width / 3;
75 }
76 // row iterator
77 TO_ROW_IT row_it(block->get_rows());
78 row_index = 1;
79 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
80 row = row_it.data();
81 if ((row->pitch_decision == PITCH_DEF_PROP) || (row->pitch_decision == PITCH_CORR_PROP)) {
82 if ((tosp_debug_level > 0) && !old_text_ord_proportional) {
83 tprintf("Block %d Row %d: Now Proportional\n", block_index, row_index);
84 }
85 row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width,
86 block_non_space_gap_width);
87 } else {
88 if ((tosp_debug_level > 0) && old_text_ord_proportional) {
89 tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_index,
90 row_index, row->pitch_decision, row->fixed_pitch);
91 }
92 }
93 #ifndef GRAPHICS_DISABLED
94 if (textord_show_initial_words) {
95 plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
96 }
97 #endif
98 row_index++;
99 }
100 block_index++;
101 }
102 }
103
104 /*************************************************************************
105 * block_spacing_stats()
106 *************************************************************************/
107
108 void Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional,
109 int16_t &block_space_gap_width, // resulting estimate
110 int16_t &block_non_space_gap_width // resulting estimate
111 ) {
112 TO_ROW *row; // current row
113 BLOBNBOX_IT blob_it; // iterator
114
115 STATS centre_to_centre_stats(0, MAXSPACING - 1);
116 // DEBUG USE ONLY
117 STATS all_gap_stats(0, MAXSPACING - 1);
118 STATS space_gap_stats(0, MAXSPACING - 1);
119 int16_t minwidth = MAXSPACING; // narrowest blob
120 TBOX blob_box;
121 TBOX prev_blob_box;
122 int16_t centre_to_centre;
123 int16_t gap_width;
124 float real_space_threshold;
125 float iqr_centre_to_centre; // DEBUG USE ONLY
126 float iqr_all_gap_stats; // DEBUG USE ONLY
127 int32_t end_of_row;
128 int32_t row_length;
129
130 // row iterator
131 TO_ROW_IT row_it(block->get_rows());
132 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
133 row = row_it.data();
134 if (!row->blob_list()->empty() &&
135 (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||
136 (row->pitch_decision == PITCH_CORR_PROP))) {
137 blob_it.set_to_list(row->blob_list());
138 blob_it.mark_cycle_pt();
139 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
140 if (tosp_use_pre_chopping) {
141 blob_box = box_next_pre_chopped(&blob_it);
142 } else if (tosp_stats_use_xht_gaps) {
143 blob_box = reduced_box_next(row, &blob_it);
144 } else {
145 blob_box = box_next(&blob_it);
146 }
147 row_length = end_of_row - blob_box.left();
148 if (blob_box.width() < minwidth) {
149 minwidth = blob_box.width();
150 }
151 prev_blob_box = blob_box;
152 while (!blob_it.cycled_list()) {
153 if (tosp_use_pre_chopping) {
154 blob_box = box_next_pre_chopped(&blob_it);
155 } else if (tosp_stats_use_xht_gaps) {
156 blob_box = reduced_box_next(row, &blob_it);
157 } else {
158 blob_box = box_next(&blob_it);
159 }
160 if (blob_box.width() < minwidth) {
161 minwidth = blob_box.width();
162 }
163 int16_t left = prev_blob_box.right();
164 int16_t right = blob_box.left();
165 gap_width = right - left;
166 if (!ignore_big_gap(row, row_length, gapmap, left, right)) {
167 all_gap_stats.add(gap_width, 1);
168
169 centre_to_centre = (right + blob_box.right() - (prev_blob_box.left() + left)) / 2;
170 // DEBUG
171 centre_to_centre_stats.add(centre_to_centre, 1);
172 // DEBUG
173 }
174 prev_blob_box = blob_box;
175 }
176 }
177 }
178
179 // Inadequate samples
180 if (all_gap_stats.get_total() <= 1) {
181 block_non_space_gap_width = minwidth;
182 block_space_gap_width = -1; // No est. space width
183 // DEBUG
184 old_text_ord_proportional = true;
185 } else {
186 /* For debug only ..... */
187 iqr_centre_to_centre = centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25);
188 iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25);
189 old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats;
190 /* .......For debug only */
191
192 /*
193 The median of the gaps is used as an estimate of the NON-SPACE gap width.
194 This RELIES on the assumption that there are more gaps WITHIN words than
195 BETWEEN words in a block
196
197 Now try to estimate the width of a real space for all real spaces in the
198 block. Do this by using a crude threshold to ignore "narrow" gaps, then
199 find the median of the "wide" gaps and use this.
200 */
201 block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median()));
202 // median gap
203
204 row_it.set_to_list(block->get_rows());
205 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
206 row = row_it.data();
207 if (!row->blob_list()->empty() &&
208 (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||
209 (row->pitch_decision == PITCH_CORR_PROP))) {
210 real_space_threshold = std::max(tosp_init_guess_kn_mult * block_non_space_gap_width,
211 tosp_init_guess_xht_mult * row->xheight);
212 blob_it.set_to_list(row->blob_list());
213 blob_it.mark_cycle_pt();
214 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
215 if (tosp_use_pre_chopping) {
216 blob_box = box_next_pre_chopped(&blob_it);
217 } else if (tosp_stats_use_xht_gaps) {
218 blob_box = reduced_box_next(row, &blob_it);
219 } else {
220 blob_box = box_next(&blob_it);
221 }
222 row_length = blob_box.left() - end_of_row;
223 prev_blob_box = blob_box;
224 while (!blob_it.cycled_list()) {
225 if (tosp_use_pre_chopping) {
226 blob_box = box_next_pre_chopped(&blob_it);
227 } else if (tosp_stats_use_xht_gaps) {
228 blob_box = reduced_box_next(row, &blob_it);
229 } else {
230 blob_box = box_next(&blob_it);
231 }
232 int16_t left = prev_blob_box.right();
233 int16_t right = blob_box.left();
234 gap_width = right - left;
235 if ((gap_width > real_space_threshold) &&
236 !ignore_big_gap(row, row_length, gapmap, left, right)) {
237 /*
238 If tosp_use_cert_spaces is enabled, the estimate of the space gap is
239 restricted to obvious spaces - those wider than half the xht or
240 those with wide blobs on both sides - i.e not things that are
241 suspect 1's or punctuation that is sometimes widely spaced.
242 */
243 if (!tosp_block_use_cert_spaces ||
244 (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
245 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
246 (!tosp_narrow_blobs_not_cert ||
247 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
248 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
249 space_gap_stats.add(gap_width, 1);
250 }
251 }
252 prev_blob_box = blob_box;
253 }
254 }
255 }
256 // Inadequate samples
257 if (space_gap_stats.get_total() <= 2) {
258 block_space_gap_width = -1; // No est. space width
259 } else {
260 block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())),
261 static_cast<int16_t>(3 * block_non_space_gap_width));
262 }
263 }
264 }
265
266 /*************************************************************************
267 * row_spacing_stats()
268 * Set values for min_space, max_non_space based on row stats only
269 * If failure - return 0 values.
270 *************************************************************************/
271 void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx,
272 int16_t block_space_gap_width, // estimate for block
273 int16_t block_non_space_gap_width // estimate for block
274 ) {
275 // iterator
276 BLOBNBOX_IT blob_it = row->blob_list();
277 STATS all_gap_stats(0, MAXSPACING - 1);
278 STATS cert_space_gap_stats(0, MAXSPACING - 1);
279 STATS all_space_gap_stats(0, MAXSPACING - 1);
280 STATS small_gap_stats(0, MAXSPACING - 1);
281 TBOX blob_box;
282 TBOX prev_blob_box;
283 int16_t gap_width;
284 int16_t real_space_threshold = 0;
285 int16_t max = 0;
286 int16_t large_gap_count = 0;
287 bool suspected_table;
288 bool good_block_space_estimate = block_space_gap_width > 0;
289 int32_t end_of_row;
290 int32_t row_length = 0;
291 float sane_space;
292 int32_t sane_threshold;
293
294 /* Collect first pass stats for row */
295
296 if (!good_block_space_estimate) {
297 block_space_gap_width = int16_t(std::floor(row->xheight / 2));
298 }
299 if (!row->blob_list()->empty()) {
300 if (tosp_threshold_bias1 > 0) {
301 real_space_threshold =
302 block_non_space_gap_width +
303 int16_t(floor(0.5 + tosp_threshold_bias1 *
304 (block_space_gap_width - block_non_space_gap_width)));
305 } else {
306 real_space_threshold = // Old TO method
307 (block_space_gap_width + block_non_space_gap_width) / 2;
308 }
309 blob_it.set_to_list(row->blob_list());
310 blob_it.mark_cycle_pt();
311 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
312 if (tosp_use_pre_chopping) {
313 blob_box = box_next_pre_chopped(&blob_it);
314 } else if (tosp_stats_use_xht_gaps) {
315 blob_box = reduced_box_next(row, &blob_it);
316 } else {
317 blob_box = box_next(&blob_it);
318 }
319 row_length = end_of_row - blob_box.left();
320 prev_blob_box = blob_box;
321 while (!blob_it.cycled_list()) {
322 if (tosp_use_pre_chopping) {
323 blob_box = box_next_pre_chopped(&blob_it);
324 } else if (tosp_stats_use_xht_gaps) {
325 blob_box = reduced_box_next(row, &blob_it);
326 } else {
327 blob_box = box_next(&blob_it);
328 }
329 int16_t left = prev_blob_box.right();
330 int16_t right = blob_box.left();
331 gap_width = right - left;
332 if (ignore_big_gap(row, row_length, gapmap, left, right)) {
333 large_gap_count++;
334 } else {
335 if (gap_width >= real_space_threshold) {
336 if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
337 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
338 (!tosp_narrow_blobs_not_cert ||
339 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
340 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
341 cert_space_gap_stats.add(gap_width, 1);
342 }
343 all_space_gap_stats.add(gap_width, 1);
344 } else {
345 small_gap_stats.add(gap_width, 1);
346 }
347 all_gap_stats.add(gap_width, 1);
348 }
349 prev_blob_box = blob_box;
350 }
351 }
352 suspected_table = (large_gap_count > 1) ||
353 ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples));
354
355 /* Now determine row kern size, space size and threshold */
356
357 if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) ||
358 ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) &&
359 cert_space_gap_stats.get_total() > 0)) {
360 old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats,
361 block_space_gap_width, block_non_space_gap_width);
362 } else {
363 if (!tosp_recovery_isolated_row_stats ||
364 !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) {
365 if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) {
366 tprintf("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx);
367 }
368 if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
369 // Use block default
370 row->space_size = block_space_gap_width;
371 if (all_gap_stats.get_total() > tosp_redo_kern_limit) {
372 row->kern_size = all_gap_stats.median();
373 } else {
374 row->kern_size = block_non_space_gap_width;
375 }
376 row->space_threshold =
377 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
378 } else {
379 old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats,
380 block_space_gap_width, block_non_space_gap_width);
381 }
382 }
383 }
384
385 if (tosp_improve_thresh && !suspected_table) {
386 improve_row_threshold(row, &all_gap_stats);
387 }
388
389 /* Now lets try to be careful not to do anything silly with tables when we
390 are ignoring big gaps*/
391 if (tosp_sanity_method == 0) {
392 if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
393 if (tosp_debug_level > 5) {
394 tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx,
395 row->kern_size, row->space_threshold, row->space_size);
396 }
397 row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size);
398 row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
399 }
400 } else if (tosp_sanity_method == 1) {
401 sane_space = row->space_size;
402 /* NEVER let space size get too close to kern size */
403 if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
404 ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) {
405 if (good_block_space_estimate &&
406 (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) {
407 sane_space = block_space_gap_width;
408 } else {
409 sane_space =
410 std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),
411 row->xheight / 2.0f);
412 }
413 if (tosp_debug_level > 5) {
414 tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx,
415 row->kern_size, row->space_threshold, row->space_size, sane_space);
416 }
417 row->space_size = sane_space;
418 row->space_threshold =
419 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
420 }
421 /* NEVER let threshold get VERY far away from kern */
422 sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f)));
423 if (row->space_threshold > sane_threshold) {
424 if (tosp_debug_level > 5) {
425 tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx,
426 row->kern_size, row->space_threshold, row->space_size, sane_threshold);
427 }
428 row->space_threshold = sane_threshold;
429 if (row->space_size <= sane_threshold) {
430 row->space_size = row->space_threshold + 1.0f;
431 }
432 }
433 /* Beware of tables - there may be NO spaces */
434 if (suspected_table) {
435 sane_space =
436 std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight);
437 sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2));
438
439 if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) {
440 if (tosp_debug_level > 5) {
441 tprintf("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx,
442 row->kern_size, row->space_threshold, row->space_size);
443 }
444 // the minimum sane value
445 row->space_threshold = static_cast<int32_t>(sane_space);
446 row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
447 }
448 }
449 }
450
451 /* Now lets try to put some error limits on the threshold */
452
453 if (tosp_old_to_method) {
454 /* Old textord made a space if gap >= threshold */
455 // NO FUZZY SPACES YET
456 row->max_nonspace = row->space_threshold;
457 // NO FUZZY SPACES YET
458 row->min_space = row->space_threshold + 1;
459 } else {
460 /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
461 row->min_space =
462 std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size));
463 if (row->min_space <= row->space_threshold) {
464 // Don't be silly
465 row->min_space = row->space_threshold + 1;
466 }
467 /*
468 Lets try to guess the max certain kern gap by looking at the cluster of
469 kerns for the row. The row is proportional so the kerns should cluster
470 tightly at the bottom of the distribution. We also expect most gaps to be
471 kerns. Find the maximum of the kern piles between 0 and twice the kern
472 estimate. Piles before the first one with less than 1/10 the maximum
473 number of samples can be taken as certain kerns.
474
475 Of course, there are some cases where the kern peak and space peaks merge,
476 so we will put an UPPER limit on the max certain kern gap of some fraction
477 below the threshold.
478 */
479
480 // upper bound
481 int32_t max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2);
482
483 // default
484 row->max_nonspace = max_max_nonspace;
485 for (int32_t index = 0; index <= max_max_nonspace; index++) {
486 if (all_gap_stats.pile_count(index) > max) {
487 max = all_gap_stats.pile_count(index);
488 }
489 if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) {
490 row->max_nonspace = index;
491 break;
492 }
493 }
494 }
495
496 /* Yet another algorithm - simpler this time - just choose a fraction of the
497 threshold to space range */
498
499 if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) {
500 row->min_space = std::max(
501 row->min_space, static_cast<int32_t>(ceil(row->space_threshold +
502 tosp_fuzzy_sp_fraction *
503 (row->space_size - row->space_threshold))));
504 }
505
506 /* Ensure that ANY space less than some multiplier times the kern size is
507 fuzzy. In tables there is a risk of erroneously setting a small space size
508 when there are no real spaces. Sometimes tables have text squashed into
509 columns so that the kn->sp ratio is small anyway - this means that we can't
510 use this to force a wider separation - hence we rely on context to join any
511 dubious breaks. */
512
513 if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) {
514 row->min_space = std::max(
515 row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size)));
516 }
517
518 if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
519 row->max_nonspace = static_cast<int32_t>(floor(
520 0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size)));
521 }
522 if (row->max_nonspace > row->space_threshold) {
523 // Don't be silly
524 row->max_nonspace = row->space_threshold;
525 }
526
527 if (tosp_debug_level > 5) {
528 tprintf(
529 "B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) "
530 "Sp:%3.2f\n",
531 block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width,
532 real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold,
533 row->min_space, row->space_size);
534 }
535 if (tosp_debug_level > 10) {
536 tprintf(
537 "row->kern_size = %3.2f, row->space_size = %3.2f, "
538 "row->space_threshold = %d\n",
539 row->kern_size, row->space_size, row->space_threshold);
540 }
541 }
542
543 void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,
544 STATS *small_gap_stats,
545 int16_t block_space_gap_width, // estimate for block
546 int16_t block_non_space_gap_width // estimate for block
547 ) {
548 /* First, estimate row space size */
549 /* Old to condition was > 2 */
550 if (space_gap_stats->get_total() >= tosp_enough_space_samples_for_median) {
551 // Adequate samples
552 /* Set space size to median of spaces BUT limits it if it seems wildly out
553 */
554 row->space_size = space_gap_stats->median();
555 if (row->space_size > block_space_gap_width * 1.5) {
556 if (tosp_old_to_bug_fix) {
557 row->space_size = block_space_gap_width * 1.5;
558 } else {
559 // BUG??? should be *1.5
560 row->space_size = block_space_gap_width;
561 }
562 }
563 if (row->space_size < (block_non_space_gap_width * 2) + 1) {
564 row->space_size = (block_non_space_gap_width * 2) + 1;
565 }
566 }
567 // Only 1 or 2 samples
568 else if (space_gap_stats->get_total() >= 1) {
569 // hence mean not median
570 row->space_size = space_gap_stats->mean();
571 if (row->space_size > block_space_gap_width * 1.5) {
572 if (tosp_old_to_bug_fix) {
573 row->space_size = block_space_gap_width * 1.5;
574 } else {
575 // BUG??? should be *1.5
576 row->space_size = block_space_gap_width;
577 }
578 }
579 if (row->space_size < (block_non_space_gap_width * 3) + 1) {
580 row->space_size = (block_non_space_gap_width * 3) + 1;
581 }
582 } else {
583 // Use block default
584 row->space_size = block_space_gap_width;
585 }
586
587 /* Next, estimate row kern size */
588 if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) {
589 row->kern_size = small_gap_stats->median();
590 } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) {
591 row->kern_size = all_gap_stats->median();
592 } else { // old TO -SAME FOR ALL ROWS
593 row->kern_size = block_non_space_gap_width;
594 }
595
596 /* Finally, estimate row space threshold */
597 if (tosp_threshold_bias2 > 0) {
598 row->space_threshold = int32_t(
599 floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size)));
600 } else {
601 /*
602 NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold
603 and holds this in a float. The use is with a >= test
604 NEW textord uses an integer threshold and a > test
605 It comes to the same thing.
606 (Though there is a difference in that old textor has integer space_size
607 and kern_size.)
608 */
609 row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
610 }
611
612 // Apply the same logic and ratios as in row_spacing_stats to
613 // restrict relative values of the row's space_size, kern_size, and
614 // space_threshold
615 if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 &&
616 ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
617 ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight))) {
618 if (row->kern_size > 2.5) {
619 row->kern_size = row->space_size / tosp_min_sane_kn_sp;
620 }
621 row->space_threshold =
622 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
623 }
624 }
625
626 /*************************************************************************
627 * isolated_row_stats()
628 * Set values for min_space, max_non_space based on row stats only
629 *************************************************************************/
630 bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats,
631 bool suspected_table, int16_t block_idx, int16_t row_idx) {
632 float kern_estimate;
633 float crude_threshold_estimate;
634 int16_t small_gaps_count;
635 int16_t total;
636 // iterator
637 BLOBNBOX_IT blob_it = row->blob_list();
638 STATS cert_space_gap_stats(0, MAXSPACING - 1);
639 STATS all_space_gap_stats(0, MAXSPACING - 1);
640 STATS small_gap_stats(0, MAXSPACING - 1);
641 TBOX blob_box;
642 TBOX prev_blob_box;
643 int16_t gap_width;
644 int32_t end_of_row;
645 int32_t row_length;
646
647 kern_estimate = all_gap_stats->median();
648 crude_threshold_estimate =
649 std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight);
650 small_gaps_count =
651 stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_threshold_estimate)));
652 total = all_gap_stats->get_total();
653
654 if ((total <= tosp_redo_kern_limit) ||
655 ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) ||
656 (total - small_gaps_count < 1)) {
657 if (tosp_debug_level > 5) {
658 tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx);
659 }
660 return false;
661 }
662 blob_it.set_to_list(row->blob_list());
663 blob_it.mark_cycle_pt();
664 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
665 if (tosp_use_pre_chopping) {
666 blob_box = box_next_pre_chopped(&blob_it);
667 } else if (tosp_stats_use_xht_gaps) {
668 blob_box = reduced_box_next(row, &blob_it);
669 } else {
670 blob_box = box_next(&blob_it);
671 }
672 row_length = end_of_row - blob_box.left();
673 prev_blob_box = blob_box;
674 while (!blob_it.cycled_list()) {
675 if (tosp_use_pre_chopping) {
676 blob_box = box_next_pre_chopped(&blob_it);
677 } else if (tosp_stats_use_xht_gaps) {
678 blob_box = reduced_box_next(row, &blob_it);
679 } else {
680 blob_box = box_next(&blob_it);
681 }
682 int16_t left = prev_blob_box.right();
683 int16_t right = blob_box.left();
684 gap_width = right - left;
685 if (!ignore_big_gap(row, row_length, gapmap, left, right) &&
686 (gap_width > crude_threshold_estimate)) {
687 if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
688 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
689 (!tosp_narrow_blobs_not_cert ||
690 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
691 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
692 cert_space_gap_stats.add(gap_width, 1);
693 }
694 all_space_gap_stats.add(gap_width, 1);
695 }
696 if (gap_width < crude_threshold_estimate) {
697 small_gap_stats.add(gap_width, 1);
698 }
699
700 prev_blob_box = blob_box;
701 }
702 if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
703 // median
704 row->space_size = cert_space_gap_stats.median();
705 } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) {
706 // to avoid spaced
707 row->space_size = cert_space_gap_stats.mean();
708 // 1's in tables
709 } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
710 // median
711 row->space_size = all_space_gap_stats.median();
712 } else {
713 row->space_size = all_space_gap_stats.mean();
714 }
715
716 if (tosp_only_small_gaps_for_kern) {
717 row->kern_size = small_gap_stats.median();
718 } else {
719 row->kern_size = all_gap_stats->median();
720 }
721 row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
722 /* Sanity check */
723 if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) ||
724 (row->space_threshold <= 0)) {
725 if (tosp_debug_level > 5) {
726 tprintf("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx,
727 row->kern_size, row->space_threshold, row->space_size);
728 }
729 row->kern_size = 0.0f;
730 row->space_threshold = 0;
731 row->space_size = 0.0f;
732 return false;
733 }
734
735 if (tosp_debug_level > 5) {
736 tprintf("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size,
737 row->space_threshold, row->space_size);
738 }
739 return true;
740 }
741
742 int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {
743 int16_t index;
744 int16_t total = 0;
745
746 for (index = 0; index < threshold; index++) {
747 total += stats->pile_count(index);
748 }
749 return total;
750 }
751
752 /*************************************************************************
753 * improve_row_threshold()
754 * Try to recognise a "normal line" -
755 * > 25 gaps
756 * && space > 3 * kn && space > 10
757 * (I.e. reasonably large space and kn:sp ratio)
758 * && > 3/4 # gaps < kn + (sp - kn)/3
759 * (I.e. most gaps are well away from space estimate)
760 * && a gap of max(3, (sp - kn) / 3) empty histogram positions is found
761 * somewhere in the histogram between kn and sp
762 * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
763 * NO!!!!! the bristol line has "11" with a gap of 12 between the
764 *1's!!! try moving the default threshold to within this band but leave the
765 * fuzzy limit calculation as at present.
766 *************************************************************************/
767 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
768 float sp = row->space_size;
769 float kn = row->kern_size;
770 int16_t reqd_zero_width = 0;
771 int16_t zero_width = 0;
772 int16_t zero_start = 0;
773 int16_t index = 0;
774
775 if (tosp_debug_level > 10) {
776 tprintf("Improve row threshold 0");
777 }
778 if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) ||
779 (stats_count_under(all_gap_stats, static_cast<int16_t>(ceil(kn + (sp - kn) / 3 + 0.5))) <
780 (0.75 * all_gap_stats->get_total()))) {
781 return;
782 }
783 if (tosp_debug_level > 10) {
784 tprintf(" 1");
785 }
786 /*
787 Look for the first region of all 0's in the histogram which is wider than
788 max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current
789 threshold is not within it, move the threshold so that is just inside it.
790 */
791 reqd_zero_width = static_cast<int16_t>(floor((sp - kn) / 3 + 0.5));
792 if (reqd_zero_width < 3) {
793 reqd_zero_width = 3;
794 }
795
796 for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) {
797 if (all_gap_stats->pile_count(index) == 0) {
798 if (zero_width == 0) {
799 zero_start = index;
800 }
801 zero_width++;
802 } else {
803 if (zero_width >= reqd_zero_width) {
804 break;
805 } else {
806 zero_width = 0;
807 }
808 }
809 }
810 index--;
811 if (tosp_debug_level > 10) {
812 tprintf(" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n", reqd_zero_width,
813 zero_width, zero_start, row->space_threshold);
814 }
815 if ((zero_width < reqd_zero_width) ||
816 ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) {
817 return;
818 }
819 if (tosp_debug_level > 10) {
820 tprintf(" 2");
821 }
822 if (row->space_threshold < zero_start) {
823 if (tosp_debug_level > 5) {
824 tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start,
825 index, row->space_threshold, zero_start);
826 }
827 row->space_threshold = zero_start;
828 }
829 if (row->space_threshold > index) {
830 if (tosp_debug_level > 5) {
831 tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start,
832 index, row->space_threshold, index);
833 }
834 row->space_threshold = index;
835 }
836 }
837
838 /**********************************************************************
839 * make_prop_words
840 *
841 * Convert a TO_ROW to a ROW.
842 **********************************************************************/
843 ROW *Textord::make_prop_words(TO_ROW *row, // row to make
844 FCOORD rotation // for drawing
845 ) {
846 bool bol; // start of line
847 /* prev_ values are for start of word being built. non prev_ values are for
848 the gap between the word being built and the next one. */
849 bool prev_fuzzy_sp; // probably space
850 bool prev_fuzzy_non; // probably not
851 uint8_t prev_blanks; // in front of word
852 bool fuzzy_sp = false; // probably space
853 bool fuzzy_non = false; // probably not
854 uint8_t blanks = 0; // in front of word
855 bool prev_gap_was_a_space = false;
856 bool break_at_next_gap = false;
857 ROW *real_row; // output row
858 C_OUTLINE_IT cout_it;
859 C_BLOB_LIST cblobs;
860 C_BLOB_IT cblob_it = &cblobs;
861 WERD_LIST words;
862 WERD *word; // new word
863 int32_t next_rep_char_word_right = INT32_MAX;
864 float repetition_spacing; // gap between repetitions
865 int32_t xstarts[2]; // row ends
866 int32_t prev_x; // end of prev blob
867 BLOBNBOX_IT box_it; // iterator
868 TBOX prev_blob_box;
869 TBOX next_blob_box;
870 int16_t prev_gap = INT16_MAX;
871 int16_t current_gap = INT16_MAX;
872 int16_t next_gap = INT16_MAX;
873 int16_t prev_within_xht_gap = INT16_MAX;
874 int16_t current_within_xht_gap = INT16_MAX;
875 int16_t next_within_xht_gap = INT16_MAX;
876 int16_t word_count = 0;
877
878 // repeated char words
879 WERD_IT rep_char_it(&(row->rep_words));
880 if (!rep_char_it.empty()) {
881 next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
882 }
883
884 prev_x = -INT16_MAX;
885 cblob_it.set_to_list(&cblobs);
886 box_it.set_to_list(row->blob_list());
887 // new words
888 WERD_IT word_it(&words);
889 bol = true;
890 prev_blanks = 0;
891 prev_fuzzy_sp = false;
892 prev_fuzzy_non = false;
893 if (!box_it.empty()) {
894 xstarts[0] = box_it.data()->bounding_box().left();
895 if (xstarts[0] > next_rep_char_word_right) {
896 /* We need to insert a repeated char word at the start of the row */
897 word = rep_char_it.extract();
898 word_it.add_after_then_move(word);
899 /* Set spaces before repeated char word */
900 word->set_flag(W_BOL, true);
901 bol = false;
902 word->set_blanks(0);
903 // NO uncertainty
904 word->set_flag(W_FUZZY_SP, false);
905 word->set_flag(W_FUZZY_NON, false);
906 xstarts[0] = word->bounding_box().left();
907 /* Set spaces after repeated char word (and leave current word set) */
908 repetition_spacing = find_mean_blob_spacing(word);
909 current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right;
910 current_within_xht_gap = current_gap;
911 if (current_gap > tosp_rep_space * repetition_spacing) {
912 prev_blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
913 if (prev_blanks < 1) {
914 prev_blanks = 1;
915 }
916 } else {
917 prev_blanks = 0;
918 }
919 if (tosp_debug_level > 5) {
920 tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
921 box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(),
922 repetition_spacing, current_gap);
923 }
924 prev_fuzzy_sp = false;
925 prev_fuzzy_non = false;
926 if (rep_char_it.empty()) {
927 next_rep_char_word_right = INT32_MAX;
928 } else {
929 rep_char_it.forward();
930 next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
931 }
932 }
933
934 peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
935 do {
936 auto bblob = box_it.data();
937 auto blob_box = bblob->bounding_box();
938 if (bblob->joined_to_prev()) {
939 auto cblob = bblob->remove_cblob();
940 if (cblob != nullptr) {
941 cout_it.set_to_list(cblob_it.data()->out_list());
942 cout_it.move_to_last();
943 cout_it.add_list_after(cblob->out_list());
944 delete cblob;
945 }
946 } else {
947 auto cblob = bblob->cblob();
948 if (cblob != nullptr) {
949 bblob->set_owns_cblob(false);
950 cblob_it.add_after_then_move(cblob);
951 }
952 prev_x = blob_box.right();
953 }
954 box_it.forward(); // next one
955 bblob = box_it.data();
956 blob_box = bblob->bounding_box();
957
958 if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
959 /* Real Blob - not multiple outlines or pre-chopped */
960 prev_gap = current_gap;
961 prev_within_xht_gap = current_within_xht_gap;
962 prev_blob_box = next_blob_box;
963 current_gap = next_gap;
964 current_within_xht_gap = next_within_xht_gap;
965 peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
966
967 int16_t prev_gap_arg = prev_gap;
968 int16_t next_gap_arg = next_gap;
969 if (tosp_only_use_xht_gaps) {
970 prev_gap_arg = prev_within_xht_gap;
971 next_gap_arg = next_within_xht_gap;
972 }
973 // Decide if a word-break should be inserted
974 if (blob_box.left() > next_rep_char_word_right ||
975 make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap,
976 current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp,
977 fuzzy_non, prev_gap_was_a_space, break_at_next_gap) ||
978 box_it.at_first()) {
979 /* Form a new word out of the blobs collected */
980 word = new WERD(&cblobs, prev_blanks, nullptr);
981 word_count++;
982 word_it.add_after_then_move(word);
983 if (bol) {
984 word->set_flag(W_BOL, true);
985 bol = false;
986 }
987 if (prev_fuzzy_sp) {
988 // probably space
989 word->set_flag(W_FUZZY_SP, true);
990 } else if (prev_fuzzy_non) {
991 word->set_flag(W_FUZZY_NON, true);
992 }
993 // probably not
994
995 if (blob_box.left() > next_rep_char_word_right) {
996 /* We need to insert a repeated char word */
997 word = rep_char_it.extract();
998 word_it.add_after_then_move(word);
999
1000 /* Set spaces before repeated char word */
1001 repetition_spacing = find_mean_blob_spacing(word);
1002 current_gap = word->bounding_box().left() - prev_x;
1003 current_within_xht_gap = current_gap;
1004 if (current_gap > tosp_rep_space * repetition_spacing) {
1005 blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
1006 if (blanks < 1) {
1007 blanks = 1;
1008 }
1009 } else {
1010 blanks = 0;
1011 }
1012 if (tosp_debug_level > 5) {
1013 tprintf("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
1014 word->bounding_box().left(), word->bounding_box().bottom(),
1015 repetition_spacing, current_gap, blanks);
1016 }
1017 word->set_blanks(blanks);
1018 // NO uncertainty
1019 word->set_flag(W_FUZZY_SP, false);
1020 word->set_flag(W_FUZZY_NON, false);
1021
1022 /* Set spaces after repeated char word (and leave current word set)
1023 */
1024 current_gap = blob_box.left() - next_rep_char_word_right;
1025 if (current_gap > tosp_rep_space * repetition_spacing) {
1026 blanks = static_cast<uint8_t>(current_gap / row->space_size);
1027 if (blanks < 1) {
1028 blanks = 1;
1029 }
1030 } else {
1031 blanks = 0;
1032 }
1033 if (tosp_debug_level > 5) {
1034 tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks);
1035 }
1036 fuzzy_sp = false;
1037 fuzzy_non = false;
1038
1039 if (rep_char_it.empty()) {
1040 next_rep_char_word_right = INT32_MAX;
1041 } else {
1042 rep_char_it.forward();
1043 next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
1044 }
1045 }
1046
1047 if (box_it.at_first() && rep_char_it.empty()) {
1048 // at end of line
1049 word->set_flag(W_EOL, true);
1050 xstarts[1] = prev_x;
1051 } else {
1052 prev_blanks = blanks;
1053 prev_fuzzy_sp = fuzzy_sp;
1054 prev_fuzzy_non = fuzzy_non;
1055 }
1056 }
1057 }
1058 } while (!box_it.at_first()); // until back at start
1059
1060 /* Insert any further repeated char words */
1061 while (!rep_char_it.empty()) {
1062 word = rep_char_it.extract();
1063 word_it.add_after_then_move(word);
1064
1065 /* Set spaces before repeated char word */
1066 repetition_spacing = find_mean_blob_spacing(word);
1067 current_gap = word->bounding_box().left() - prev_x;
1068 if (current_gap > tosp_rep_space * repetition_spacing) {
1069 blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
1070 if (blanks < 1) {
1071 blanks = 1;
1072 }
1073 } else {
1074 blanks = 0;
1075 }
1076 if (tosp_debug_level > 5) {
1077 tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
1078 word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing,
1079 current_gap, blanks);
1080 }
1081 word->set_blanks(blanks);
1082 // NO uncertainty
1083 word->set_flag(W_FUZZY_SP, false);
1084 word->set_flag(W_FUZZY_NON, false);
1085 prev_x = word->bounding_box().right();
1086 if (rep_char_it.empty()) {
1087 // at end of line
1088 word->set_flag(W_EOL, true);
1089 xstarts[1] = prev_x;
1090 } else {
1091 rep_char_it.forward();
1092 }
1093 }
1094 real_row =
1095 new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1096 word_it.set_to_list(real_row->word_list());
1097 // put words in row
1098 word_it.add_list_after(&words);
1099 real_row->recalc_bounding_box();
1100
1101 if (tosp_debug_level > 4) {
1102 tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
1103 real_row->bounding_box().left(), real_row->bounding_box().bottom(),
1104 real_row->bounding_box().right(), real_row->bounding_box().top());
1105 }
1106 return real_row;
1107 }
1108 return nullptr;
1109 }
1110
1111 /**********************************************************************
1112 * make_blob_words
1113 *
1114 * Converts words into blobs so that each blob is a single character.
1115 * Used for chopper test.
1116 **********************************************************************/
1117 ROW *Textord::make_blob_words(TO_ROW *row, // row to make
1118 FCOORD rotation // for drawing
1119 ) {
1120 bool bol; // start of line
1121 ROW *real_row; // output row
1122 C_OUTLINE_IT cout_it;
1123 C_BLOB_LIST cblobs;
1124 C_BLOB_IT cblob_it = &cblobs;
1125 WERD_LIST words;
1126 WERD *word; // new word
1127 BLOBNBOX_IT box_it; // iterator
1128 int16_t word_count = 0;
1129
1130 cblob_it.set_to_list(&cblobs);
1131 box_it.set_to_list(row->blob_list());
1132 // new words
1133 WERD_IT word_it(&words);
1134 bol = true;
1135 if (!box_it.empty()) {
1136 do {
1137 auto bblob = box_it.data();
1138 auto blob_box = bblob->bounding_box();
1139 if (bblob->joined_to_prev()) {
1140 auto cblob = bblob->remove_cblob();
1141 if (cblob != nullptr) {
1142 cout_it.set_to_list(cblob_it.data()->out_list());
1143 cout_it.move_to_last();
1144 cout_it.add_list_after(cblob->out_list());
1145 delete cblob;
1146 }
1147 } else {
1148 auto cblob = bblob->cblob();
1149 if (cblob != nullptr) {
1150 bblob->set_owns_cblob(false);
1151 cblob_it.add_after_then_move(cblob);
1152 }
1153 }
1154 box_it.forward(); // next one
1155 bblob = box_it.data();
1156 blob_box = bblob->bounding_box();
1157
1158 if (!bblob->joined_to_prev() && !cblobs.empty()) {
1159 word = new WERD(&cblobs, 1, nullptr);
1160 word_count++;
1161 word_it.add_after_then_move(word);
1162 if (bol) {
1163 word->set_flag(W_BOL, true);
1164 bol = false;
1165 }
1166 if (box_it.at_first()) { // at end of line
1167 word->set_flag(W_EOL, true);
1168 }
1169 }
1170 } while (!box_it.at_first()); // until back at start
1171 /* Setup the row with created words. */
1172 real_row =
1173 new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1174 word_it.set_to_list(real_row->word_list());
1175 // put words in row
1176 word_it.add_list_after(&words);
1177 real_row->recalc_bounding_box();
1178 if (tosp_debug_level > 4) {
1179 tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
1180 real_row->bounding_box().left(), real_row->bounding_box().bottom(),
1181 real_row->bounding_box().right(), real_row->bounding_box().top());
1182 }
1183 return real_row;
1184 }
1185 return nullptr;
1186 }
1187
1188 bool Textord::make_a_word_break(TO_ROW *row, // row being made
1189 TBOX blob_box, // for next_blob // how many blanks?
1190 int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap,
1191 int16_t within_xht_current_gap, TBOX next_blob_box,
1192 int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non,
1193 bool &prev_gap_was_a_space, bool &break_at_next_gap) {
1194 bool space;
1195 int16_t current_gap;
1196 float fuzzy_sp_to_kn_limit;
1197
1198 if (break_at_next_gap) {
1199 break_at_next_gap = false;
1200 return true;
1201 }
1202 /* Inhibit using the reduced gap if
1203 The kerning is large - chars are not kerned and reducing "f"s can cause
1204 erroneous blanks
1205 OR The real gap is less than 0
1206 OR The real gap is less than the kerning estimate
1207 */
1208 if ((row->kern_size > tosp_large_kerning * row->xheight) ||
1209 ((tosp_dont_fool_with_small_kerns >= 0) &&
1210 (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) {
1211 // Ignore the difference
1212 within_xht_current_gap = real_current_gap;
1213 }
1214
1215 if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) {
1216 current_gap = within_xht_current_gap;
1217 } else {
1218 current_gap = real_current_gap;
1219 }
1220
1221 if (tosp_old_to_method) {
1222 // Boring old method
1223 space = current_gap > row->max_nonspace;
1224 if (space && (current_gap < INT16_MAX)) {
1225 if (current_gap < row->min_space) {
1226 if (current_gap > row->space_threshold) {
1227 blanks = 1;
1228 fuzzy_sp = true;
1229 fuzzy_non = false;
1230 } else {
1231 blanks = 0;
1232 fuzzy_sp = false;
1233 fuzzy_non = true;
1234 }
1235 } else {
1236 if (row->space_size == 0.0f) {
1237 // Avoid FP division by 0.
1238 blanks = 1;
1239 } else {
1240 blanks = static_cast<uint8_t>(current_gap / row->space_size);
1241 if (blanks < 1) {
1242 blanks = 1;
1243 }
1244 }
1245 fuzzy_sp = false;
1246 fuzzy_non = false;
1247 }
1248 }
1249 return space;
1250 } else {
1251 /* New exciting heuristic method */
1252 if (prev_blob_box.null_box()) { // Beginning of row
1253 prev_gap_was_a_space = true;
1254 }
1255
1256 // Default as old TO
1257 space = current_gap > row->space_threshold;
1258
1259 /* Set defaults for the word break in case we find one. Currently there are
1260 no fuzzy spaces. Depending on the reliability of the different heuristics
1261 we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
1262 be used if the function returns true - ie the word is to be broken.
1263 */
1264 int num_blanks = current_gap;
1265 if (row->space_size > 1.0f) {
1266 num_blanks = IntCastRounded(current_gap / row->space_size);
1267 }
1268 blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));
1269 fuzzy_sp = false;
1270 fuzzy_non = false;
1271 /*
1272 If xht measure causes gap to flip one of the 3 thresholds act accordingly -
1273 despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
1274 context.
1275 */
1276 if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) &&
1277 (within_xht_current_gap > row->max_nonspace)) {
1278 space = true;
1279 fuzzy_non = true;
1280 #ifndef GRAPHICS_DISABLED
1281 mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1282 next_gap);
1283 #endif
1284 } else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) &&
1285 (within_xht_current_gap > row->space_threshold)) {
1286 space = true;
1287 if (tosp_flip_fuzz_kn_to_sp) {
1288 fuzzy_sp = true;
1289 } else {
1290 fuzzy_non = true;
1291 }
1292 #ifndef GRAPHICS_DISABLED
1293 mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1294 next_gap);
1295 #endif
1296 } else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) &&
1297 (within_xht_current_gap >= row->min_space)) {
1298 space = true;
1299 #ifndef GRAPHICS_DISABLED
1300 mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1301 next_gap);
1302 #endif
1303 } else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) &&
1304 suspected_punct_blob(row, blob_box)) {
1305 break_at_next_gap = true;
1306 }
1307 /* Now continue with normal heuristics */
1308 else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) {
1309 /* Heuristics to turn dubious spaces to kerns */
1310 if (tosp_pass_wide_fuzz_sp_to_context > 0) {
1311 fuzzy_sp_to_kn_limit =
1312 row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size);
1313 } else {
1314 fuzzy_sp_to_kn_limit = 99999.0f;
1315 }
1316
1317 /* If current gap is significantly smaller than the previous space the
1318 other side of a narrow blob then this gap is a kern. */
1319 if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space &&
1320 (current_gap <= tosp_gap_factor * prev_gap)) {
1321 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1322 if (tosp_flip_fuzz_sp_to_kn) {
1323 fuzzy_non = true;
1324 } else {
1325 fuzzy_sp = true;
1326 }
1327 } else {
1328 space = false;
1329 }
1330 #ifndef GRAPHICS_DISABLED
1331 mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1332 next_gap);
1333 #endif
1334 }
1335 /* If current gap not much bigger than the previous kern the other side of
1336 a narrow blob then this gap is a kern as well */
1337 else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) &&
1338 !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) {
1339 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1340 if (tosp_flip_fuzz_sp_to_kn) {
1341 fuzzy_non = true;
1342 } else {
1343 fuzzy_sp = true;
1344 }
1345 } else {
1346 space = false;
1347 }
1348 #ifndef GRAPHICS_DISABLED
1349 mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1350 next_gap);
1351 #endif
1352 } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
1353 (next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) {
1354 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1355 if (tosp_flip_fuzz_sp_to_kn) {
1356 fuzzy_non = true;
1357 } else {
1358 fuzzy_sp = true;
1359 }
1360 } else {
1361 space = false;
1362 }
1363 #ifndef GRAPHICS_DISABLED
1364 mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1365 next_gap);
1366 #endif
1367 } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
1368 (next_gap <= row->space_threshold) &&
1369 (current_gap * tosp_gap_factor <= next_gap)) {
1370 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1371 if (tosp_flip_fuzz_sp_to_kn) {
1372 fuzzy_non = true;
1373 } else {
1374 fuzzy_sp = true;
1375 }
1376 } else {
1377 space = false;
1378 }
1379 #ifndef GRAPHICS_DISABLED
1380 mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1381 next_gap);
1382 #endif
1383 } else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) ||
1384 ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) {
1385 fuzzy_sp = true;
1386 #ifndef GRAPHICS_DISABLED
1387 mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1388 next_gap);
1389 #endif
1390 }
1391 } else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) {
1392 /* Heuristics to turn dubious kerns to spaces */
1393 /* TRIED THIS BUT IT MADE THINGS WORSE
1394 if (prev_gap == INT16_MAX)
1395 prev_gap = 0; // start of row
1396 if (next_gap == INT16_MAX)
1397 next_gap = 0; // end of row
1398 */
1399 if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) &&
1400 (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&
1401 wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) {
1402 space = true;
1403 /*
1404 tosp_flip_caution is an attempt to stop the default changing in cases
1405 where there is a large difference between the kern and space estimates.
1406 See problem in 'chiefs' where "have" gets split in the quotation.
1407 */
1408 if ((tosp_flip_fuzz_kn_to_sp) &&
1409 ((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) {
1410 fuzzy_sp = true;
1411 } else {
1412 fuzzy_non = true;
1413 }
1414 #ifndef GRAPHICS_DISABLED
1415 mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1416 next_gap);
1417 #endif
1418 } else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 &&
1419 current_gap > 5 && // Rule 9 handles small gap, big ratio.
1420 current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&
1421 !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) &&
1422 !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) {
1423 space = true;
1424 fuzzy_non = true;
1425 #ifndef GRAPHICS_DISABLED
1426 mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1427 next_gap);
1428 #endif
1429 } else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) &&
1430 (next_blob_box.width() > 0) &&
1431 (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&
1432 (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) &&
1433 !suspected_punct_blob(row, next_blob_box)))) {
1434 space = true;
1435 fuzzy_non = true;
1436 #ifndef GRAPHICS_DISABLED
1437 mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1438 next_gap);
1439 #endif
1440 }
1441 }
1442 if (tosp_debug_level > 10) {
1443 tprintf(
1444 "word break = %d current_gap = %d, prev_gap = %d, "
1445 "next_gap = %d\n",
1446 space ? 1 : 0, current_gap, prev_gap, next_gap);
1447 }
1448 prev_gap_was_a_space = space && !(fuzzy_non);
1449 return space;
1450 }
1451 }
1452
1453 bool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
1454 bool result;
1455 result =
1456 ((blob_box.width() <= tosp_narrow_fraction * row->xheight) ||
1457 ((static_cast<float>(blob_box.width()) / blob_box.height()) <= tosp_narrow_aspect_ratio));
1458 return result;
1459 }
1460
1461 bool Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
1462 bool result;
1463 if (tosp_wide_fraction > 0) {
1464 if (tosp_wide_aspect_ratio > 0) {
1465 result =
1466 ((blob_box.width() >= tosp_wide_fraction * row->xheight) &&
1467 ((static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio));
1468 } else {
1469 result = (blob_box.width() >= tosp_wide_fraction * row->xheight);
1470 }
1471 } else {
1472 result = !narrow_blob(row, blob_box);
1473 }
1474 return result;
1475 }
1476
1477 bool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
1478 bool result;
1479 float baseline;
1480 float blob_x_centre;
1481 /* Find baseline of centre of blob */
1482 blob_x_centre = (box.right() + box.left()) / 2.0;
1483 baseline = row->baseline.y(blob_x_centre);
1484
1485 result = (box.height() <= 0.66 * row->xheight) || (box.top() < baseline + row->xheight / 2.0) ||
1486 (box.bottom() > baseline + row->xheight / 2.0);
1487 return result;
1488 }
1489
1490 void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box,
1491 int16_t &next_gap, int16_t &next_within_xht_gap) {
1492 TBOX next_reduced_blob_box;
1493 TBOX bit_beyond;
1494 BLOBNBOX_IT reduced_box_it = box_it;
1495
1496 next_blob_box = box_next(&box_it);
1497 next_reduced_blob_box = reduced_box_next(row, &reduced_box_it);
1498 if (box_it.at_first()) {
1499 next_gap = INT16_MAX;
1500 next_within_xht_gap = INT16_MAX;
1501 } else {
1502 bit_beyond = box_it.data()->bounding_box();
1503 next_gap = bit_beyond.left() - next_blob_box.right();
1504 bit_beyond = reduced_box_next(row, &reduced_box_it);
1505 next_within_xht_gap = bit_beyond.left() - next_reduced_blob_box.right();
1506 }
1507 }
1508
1509 #ifndef GRAPHICS_DISABLED
1510 void Textord::mark_gap(TBOX blob, // blob following gap
1511 int16_t rule, // heuristic id
1512 int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,
1513 int16_t next_blob_width, int16_t next_gap) {
1514 ScrollView::Color col; // of ellipse marking flipped gap
1515
1516 switch (rule) {
1517 case 1:
1518 col = ScrollView::RED;
1519 break;
1520 case 2:
1521 col = ScrollView::CYAN;
1522 break;
1523 case 3:
1524 col = ScrollView::GREEN;
1525 break;
1526 case 4:
1527 col = ScrollView::BLACK;
1528 break;
1529 case 5:
1530 col = ScrollView::MAGENTA;
1531 break;
1532 case 6:
1533 col = ScrollView::BLUE;
1534 break;
1535
1536 case 7:
1537 col = ScrollView::WHITE;
1538 break;
1539 case 8:
1540 col = ScrollView::YELLOW;
1541 break;
1542 case 9:
1543 col = ScrollView::BLACK;
1544 break;
1545
1546 case 20:
1547 col = ScrollView::CYAN;
1548 break;
1549 case 21:
1550 col = ScrollView::GREEN;
1551 break;
1552 case 22:
1553 col = ScrollView::MAGENTA;
1554 break;
1555 default:
1556 col = ScrollView::BLACK;
1557 }
1558 if (textord_show_initial_words) {
1559 to_win->Pen(col);
1560 /* if (rule < 20)
1561 //interior_style(to_win, INT_SOLID, false);
1562 else
1563 //interior_style(to_win, INT_HOLLOW, true);*/
1564 // x radius
1565 to_win->Ellipse(current_gap / 2.0f,
1566 blob.height() / 2.0f, // y radius
1567 // x centre
1568 blob.left() - current_gap / 2.0f,
1569 // y centre
1570 blob.bottom() + blob.height() / 2.0f);
1571 }
1572 if (tosp_debug_level > 5) {
1573 tprintf(" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2,
1574 blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap);
1575 }
1576 }
1577 #endif
1578
1579 float Textord::find_mean_blob_spacing(WERD *word) {
1580 C_BLOB_IT cblob_it;
1581 TBOX blob_box;
1582 int32_t gap_sum = 0;
1583 int16_t gap_count = 0;
1584 int16_t prev_right;
1585
1586 cblob_it.set_to_list(word->cblob_list());
1587 if (!cblob_it.empty()) {
1588 cblob_it.mark_cycle_pt();
1589 prev_right = cblob_it.data()->bounding_box().right();
1590 // first blob
1591 cblob_it.forward();
1592 for (; !cblob_it.cycled_list(); cblob_it.forward()) {
1593 blob_box = cblob_it.data()->bounding_box();
1594 gap_sum += blob_box.left() - prev_right;
1595 gap_count++;
1596 prev_right = blob_box.right();
1597 }
1598 }
1599 if (gap_count > 0) {
1600 return (gap_sum / static_cast<float>(gap_count));
1601 } else {
1602 return 0.0f;
1603 }
1604 }
1605
1606 bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left,
1607 int16_t right) {
1608 int16_t gap = right - left + 1;
1609
1610 if (tosp_ignore_big_gaps > 999) {
1611 return false; // Don't ignore
1612 }
1613 if (tosp_ignore_big_gaps > 0) {
1614 return (gap > tosp_ignore_big_gaps * row->xheight);
1615 }
1616 if (gap > tosp_ignore_very_big_gaps * row->xheight) {
1617 return true;
1618 }
1619 if (tosp_ignore_big_gaps == 0) {
1620 if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) {
1621 return true;
1622 }
1623 if ((gap > 1.75 * row->xheight) &&
1624 ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right))) {
1625 return true;
1626 }
1627 } else {
1628 /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table
1629 */
1630 if ((gap > gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right)) {
1631 return true;
1632 }
1633 }
1634 return false;
1635 }
1636
1637 /**********************************************************************
1638 * reduced_box_next
1639 *
1640 * Compute the bounding box of this blob with merging of x overlaps
1641 * but no pre-chopping.
1642 * Then move the iterator on to the start of the next blob.
1643 * DON'T reduce the box for small things - eg punctuation.
1644 **********************************************************************/
1645 TBOX Textord::reduced_box_next(TO_ROW *row, // current row
1646 BLOBNBOX_IT *it // iterator to blobds
1647 ) {
1648 BLOBNBOX *blob; // current blob
1649 BLOBNBOX *head_blob; // place to store box
1650 TBOX full_box; // full blob boundg box
1651 TBOX reduced_box; // box of significant part
1652 int16_t left_above_xht; // ABOVE xht left limit
1653 int16_t new_left_above_xht; // ABOVE xht left limit
1654
1655 blob = it->data();
1656 if (blob->red_box_set()) {
1657 reduced_box = blob->reduced_box();
1658 do {
1659 it->forward();
1660 blob = it->data();
1661 } while (blob->cblob() == nullptr || blob->joined_to_prev());
1662 return reduced_box;
1663 }
1664 head_blob = blob;
1665 full_box = blob->bounding_box();
1666 reduced_box = reduced_box_for_blob(blob, row, &left_above_xht);
1667 do {
1668 it->forward();
1669 blob = it->data();
1670 if (blob->cblob() == nullptr) {
1671 // was pre-chopped
1672 full_box += blob->bounding_box();
1673 } else if (blob->joined_to_prev()) {
1674 reduced_box += reduced_box_for_blob(blob, row, &new_left_above_xht);
1675 left_above_xht = std::min(left_above_xht, new_left_above_xht);
1676 }
1677 }
1678 // until next real blob
1679 while (blob->cblob() == nullptr || blob->joined_to_prev());
1680
1681 if ((reduced_box.width() > 0) &&
1682 ((reduced_box.left() + tosp_near_lh_edge * reduced_box.width()) < left_above_xht) &&
1683 (reduced_box.height() > 0.7 * row->xheight)) {
1684 #ifndef GRAPHICS_DISABLED
1685 if (textord_show_initial_words) {
1686 reduced_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW);
1687 }
1688 #endif
1689 } else {
1690 reduced_box = full_box;
1691 }
1692 head_blob->set_reduced_box(reduced_box);
1693 return reduced_box;
1694 }
1695
1696 /*************************************************************************
1697 * reduced_box_for_blob()
1698 * Find box for blob which is the same height and y position as the whole blob,
1699 * but whose left limit is the left most position of the blob ABOVE the
1700 * baseline and whose right limit is the right most position of the blob BELOW
1701 * the xheight.
1702 *
1703 *
1704 * !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
1705 * "home". Perhaps we need something which say if the width ABOVE the
1706 * xht alone includes the whole of the reduced width, then use the full
1707 * blob box - Might still fail on italic F
1708 *
1709 * Alternatively we could be a little less severe and only reduce the
1710 * left and right edges by half the difference between the full box and
1711 * the reduced box.
1712 *
1713 * NOTE that we need to rotate all the coordinates as
1714 * find_blob_limits finds the y min and max within a specified x band
1715 *************************************************************************/
1716 TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) {
1717 float baseline;
1718 float blob_x_centre;
1719 float left_limit;
1720 float right_limit;
1721 float junk;
1722 TBOX blob_box;
1723
1724 /* Find baseline of centre of blob */
1725
1726 blob_box = blob->bounding_box();
1727 blob_x_centre = (blob_box.left() + blob_box.right()) / 2.0;
1728 baseline = row->baseline.y(blob_x_centre);
1729
1730 /*
1731 Find LH limit of blob ABOVE the xht. This is so that we can detect certain
1732 caps ht chars which should NOT have their box reduced: T, Y, V, W etc
1733 */
1734 left_limit = static_cast<float>(INT32_MAX);
1735 junk = static_cast<float>(-INT32_MAX);
1736 find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), static_cast<float>(INT16_MAX),
1737 left_limit, junk);
1738 if (left_limit > junk) {
1739 *left_above_xht = INT16_MAX; // No area above xht
1740 } else {
1741 *left_above_xht = static_cast<int16_t>(std::floor(left_limit));
1742 }
1743 /*
1744 Find reduced LH limit of blob - the left extent of the region ABOVE the
1745 baseline.
1746 */
1747 left_limit = static_cast<float>(INT32_MAX);
1748 junk = static_cast<float>(-INT32_MAX);
1749 find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX), left_limit, junk);
1750
1751 if (left_limit > junk) {
1752 return TBOX(); // no area within xht so return empty box
1753 }
1754 /*
1755 Find reduced RH limit of blob - the right extent of the region BELOW the xht.
1756 */
1757 junk = static_cast<float>(INT32_MAX);
1758 right_limit = static_cast<float>(-INT32_MAX);
1759 find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX), (baseline + row->xheight), junk,
1760 right_limit);
1761 if (junk > right_limit) {
1762 return TBOX(); // no area within xht so return empty box
1763 }
1764
1765 return TBOX(ICOORD(static_cast<int16_t>(std::floor(left_limit)), blob_box.bottom()),
1766 ICOORD(static_cast<int16_t>(std::ceil(right_limit)), blob_box.top()));
1767 }
1768 } // namespace tesseract