comparison mupdf-source/thirdparty/tesseract/src/textord/wordseg.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: wordseg.cpp (Formerly wspace.c)
3 * Description: Code to segment the blobs into words.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #include "wordseg.h"
25
26 #include <cmath>
27
28 #include "blobbox.h"
29 #include "cjkpitch.h"
30 #include "drawtord.h"
31 #include "fpchop.h"
32 #include "makerow.h"
33 #include "pitsync1.h"
34 #include "statistc.h"
35 #include "textord.h"
36 #include "topitch.h"
37 #include "tovars.h"
38
39 namespace tesseract {
40
41 BOOL_VAR(textord_force_make_prop_words, false, "Force proportional word segmentation on all rows");
42 BOOL_VAR(textord_chopper_test, false, "Chopper is being tested.");
43
44 #define BLOCK_STATS_CLUSTERS 10
45
46 /**
47 * @name make_single_word
48 *
49 * For each row, arrange the blobs into one word. There is no fixed
50 * pitch detection.
51 */
52
53 void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) {
54 TO_ROW_IT to_row_it(rows);
55 ROW_IT row_it(real_rows);
56 for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) {
57 TO_ROW *row = to_row_it.data();
58 // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
59 // to create the word.
60 C_BLOB_LIST cblobs;
61 C_BLOB_IT cblob_it(&cblobs);
62 BLOBNBOX_IT box_it(row->blob_list());
63 for (; !box_it.empty(); box_it.forward()) {
64 BLOBNBOX *bblob = box_it.extract();
65 if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
66 auto cblob = bblob->remove_cblob();
67 if (cblob != nullptr) {
68 C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
69 cout_it.move_to_last();
70 cout_it.add_list_after(cblob->out_list());
71 delete cblob;
72 }
73 } else {
74 auto cblob = bblob->remove_cblob();
75 if (cblob != nullptr) {
76 cblob_it.add_after_then_move(cblob);
77 }
78 }
79 delete bblob;
80 }
81 // Convert the TO_ROW to a ROW.
82 ROW *real_row =
83 new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
84 WERD_IT word_it(real_row->word_list());
85 WERD *word = new WERD(&cblobs, 0, nullptr);
86 word->set_flag(W_BOL, true);
87 word->set_flag(W_EOL, true);
88 word->set_flag(W_DONT_CHOP, one_blob);
89 word_it.add_after_then_move(word);
90 real_row->recalc_bounding_box();
91 row_it.add_after_then_move(real_row);
92 }
93 }
94
95 /**
96 * make_words
97 *
98 * Arrange the blobs into words.
99 */
100 void make_words(tesseract::Textord *textord,
101 ICOORD page_tr, // top right
102 float gradient, // page skew
103 BLOCK_LIST *blocks, // block list
104 TO_BLOCK_LIST *port_blocks) { // output list
105 TO_BLOCK_IT block_it; // iterator
106 TO_BLOCK *block; // current block
107
108 if (textord->use_cjk_fp_model()) {
109 compute_fixed_pitch_cjk(page_tr, port_blocks);
110 } else {
111 compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
112 !bool(textord_test_landscape));
113 }
114 textord->to_spacing(page_tr, port_blocks);
115 block_it.set_to_list(port_blocks);
116 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
117 block = block_it.data();
118 make_real_words(textord, block, FCOORD(1.0f, 0.0f));
119 }
120 }
121
122 /**
123 * @name set_row_spaces
124 *
125 * Set the min_space and max_nonspace members of the row so that
126 * the blobs can be arranged into words.
127 */
128
129 void set_row_spaces( // find space sizes
130 TO_BLOCK *block, // block to do
131 FCOORD rotation, // for drawing
132 bool testing_on // correct orientation
133 ) {
134 TO_ROW *row; // current row
135 TO_ROW_IT row_it = block->get_rows();
136
137 if (row_it.empty()) {
138 return; // empty block
139 }
140 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
141 row = row_it.data();
142 if (row->fixed_pitch == 0) {
143 row->min_space = static_cast<int32_t>(
144 ceil(row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
145 row->max_nonspace = static_cast<int32_t>(
146 floor(row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
147 if (testing_on && textord_show_initial_words) {
148 tprintf("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace,
149 row->min_space, row->intercept());
150 }
151 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
152 row->space_size = row->pr_space;
153 row->kern_size = row->pr_nonsp;
154 }
155 #ifndef GRAPHICS_DISABLED
156 if (textord_show_initial_words && testing_on) {
157 plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
158 }
159 #endif
160 }
161 }
162
163 /**
164 * @name row_words
165 *
166 * Compute the max nonspace and min space for the row.
167 */
168
169 int32_t row_words( // compute space size
170 TO_BLOCK *block, // block it came from
171 TO_ROW *row, // row to operate on
172 int32_t maxwidth, // max expected space size
173 FCOORD rotation, // for drawing
174 bool testing_on // for debug
175 ) {
176 bool testing_row; // contains testpt
177 bool prev_valid; // if decent size
178 int32_t prev_x; // end of prev blob
179 int32_t cluster_count; // no of clusters
180 int32_t gap_index; // which cluster
181 int32_t smooth_factor; // for smoothing stats
182 BLOBNBOX *blob; // current blob
183 float lower, upper; // clustering parameters
184 float gaps[3]; // gap clusers
185 ICOORD testpt;
186 TBOX blob_box; // bounding box
187 // iterator
188 BLOBNBOX_IT blob_it = row->blob_list();
189 STATS gap_stats(0, maxwidth - 1);
190 STATS cluster_stats[4]; // clusters
191
192 testpt = ICOORD(textord_test_x, textord_test_y);
193 smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
194 // if (testing_on)
195 // tprintf("Row smooth factor=%d\n",smooth_factor);
196 prev_valid = false;
197 prev_x = -INT32_MAX;
198 testing_row = false;
199 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
200 blob = blob_it.data();
201 blob_box = blob->bounding_box();
202 if (blob_box.contains(testpt)) {
203 testing_row = true;
204 }
205 gap_stats.add(blob_box.width(), 1);
206 }
207 gap_stats.clear();
208 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
209 blob = blob_it.data();
210 if (!blob->joined_to_prev()) {
211 blob_box = blob->bounding_box();
212 if (prev_valid && blob_box.left() - prev_x < maxwidth) {
213 gap_stats.add(blob_box.left() - prev_x, 1);
214 }
215 prev_valid = true;
216 prev_x = blob_box.right();
217 }
218 }
219 if (gap_stats.get_total() == 0) {
220 row->min_space = 0; // no evidence
221 row->max_nonspace = 0;
222 return 0;
223 }
224 gap_stats.smooth(smooth_factor);
225 lower = row->xheight * textord_words_initial_lower;
226 upper = row->xheight * textord_words_initial_upper;
227 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
228 while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) {
229 // shrink gap
230 upper = (upper * 3 + lower) / 4;
231 lower = (lower * 3 + upper) / 4;
232 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
233 }
234 if (cluster_count < 2) {
235 row->min_space = 0; // no evidence
236 row->max_nonspace = 0;
237 return 0;
238 }
239 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
240 gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
241 }
242 // get medians
243 if (cluster_count > 2) {
244 if (testing_on && textord_show_initial_words) {
245 tprintf("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept(),
246 cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5));
247 }
248 lower = gaps[0];
249 if (gaps[1] > lower) {
250 upper = gaps[1]; // prefer most frequent
251 if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) {
252 upper = gaps[2];
253 }
254 } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) {
255 upper = gaps[2];
256 } else if (lower >= block->xheight * textord_words_min_minspace) {
257 upper = lower; // not nice
258 lower = gaps[1];
259 if (testing_on && textord_show_initial_words) {
260 tprintf("Had to switch most common from lower to upper!!\n");
261 gap_stats.print();
262 }
263 } else {
264 row->min_space = 0; // no evidence
265 row->max_nonspace = 0;
266 return 0;
267 }
268 } else {
269 if (gaps[1] < gaps[0]) {
270 if (testing_on && textord_show_initial_words) {
271 tprintf("Had to switch most common from lower to upper!!\n");
272 gap_stats.print();
273 }
274 lower = gaps[1];
275 upper = gaps[0];
276 } else {
277 upper = gaps[1];
278 lower = gaps[0];
279 }
280 }
281 if (upper < block->xheight * textord_words_min_minspace) {
282 row->min_space = 0; // no evidence
283 row->max_nonspace = 0;
284 return 0;
285 }
286 if (upper * 3 < block->min_space * 2 + block->max_nonspace ||
287 lower * 3 > block->min_space * 2 + block->max_nonspace) {
288 if (testing_on && textord_show_initial_words) {
289 tprintf("Disagreement between block and row at %g!!\n", row->intercept());
290 tprintf("Lower=%g, upper=%g, Stats:\n", lower, upper);
291 gap_stats.print();
292 }
293 }
294 row->min_space =
295 static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
296 row->max_nonspace =
297 static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
298 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
299 row->space_size = upper;
300 row->kern_size = lower;
301 if (testing_on && textord_show_initial_words) {
302 if (testing_row) {
303 tprintf("GAP STATS\n");
304 gap_stats.print();
305 tprintf("SPACE stats\n");
306 cluster_stats[2].print_summary();
307 tprintf("NONSPACE stats\n");
308 cluster_stats[1].print_summary();
309 }
310 tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
311 upper, row->max_nonspace, lower);
312 }
313 return cluster_stats[2].get_total();
314 }
315
316 /**
317 * @name row_words2
318 *
319 * Compute the max nonspace and min space for the row.
320 */
321
322 int32_t row_words2( // compute space size
323 TO_BLOCK *block, // block it came from
324 TO_ROW *row, // row to operate on
325 int32_t maxwidth, // max expected space size
326 FCOORD rotation, // for drawing
327 bool testing_on // for debug
328 ) {
329 bool prev_valid; // if decent size
330 bool this_valid; // current blob big enough
331 int32_t prev_x; // end of prev blob
332 int32_t min_width; // min interesting width
333 int32_t valid_count; // good gaps
334 int32_t total_count; // total gaps
335 int32_t cluster_count; // no of clusters
336 int32_t prev_count; // previous cluster_count
337 int32_t gap_index; // which cluster
338 int32_t smooth_factor; // for smoothing stats
339 BLOBNBOX *blob; // current blob
340 float lower, upper; // clustering parameters
341 ICOORD testpt;
342 TBOX blob_box; // bounding box
343 // iterator
344 BLOBNBOX_IT blob_it = row->blob_list();
345 STATS gap_stats(0, maxwidth - 1);
346 // gap sizes
347 float gaps[BLOCK_STATS_CLUSTERS];
348 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
349 // clusters
350
351 testpt = ICOORD(textord_test_x, textord_test_y);
352 smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
353 // if (testing_on)
354 // tprintf("Row smooth factor=%d\n",smooth_factor);
355 prev_valid = false;
356 prev_x = -INT16_MAX;
357 const bool testing_row = false;
358 // min blob size
359 min_width = static_cast<int32_t>(block->pr_space);
360 total_count = 0;
361 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
362 blob = blob_it.data();
363 if (!blob->joined_to_prev()) {
364 blob_box = blob->bounding_box();
365 this_valid = blob_box.width() >= min_width;
366 if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) {
367 gap_stats.add(blob_box.left() - prev_x, 1);
368 }
369 total_count++; // count possibles
370 prev_x = blob_box.right();
371 prev_valid = this_valid;
372 }
373 }
374 valid_count = gap_stats.get_total();
375 if (valid_count < total_count * textord_words_minlarge) {
376 gap_stats.clear();
377 prev_x = -INT16_MAX;
378 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
379 blob = blob_it.data();
380 if (!blob->joined_to_prev()) {
381 blob_box = blob->bounding_box();
382 if (blob_box.left() - prev_x < maxwidth) {
383 gap_stats.add(blob_box.left() - prev_x, 1);
384 }
385 prev_x = blob_box.right();
386 }
387 }
388 }
389 if (gap_stats.get_total() == 0) {
390 row->min_space = 0; // no evidence
391 row->max_nonspace = 0;
392 return 0;
393 }
394
395 cluster_count = 0;
396 lower = block->xheight * words_initial_lower;
397 upper = block->xheight * words_initial_upper;
398 gap_stats.smooth(smooth_factor);
399 do {
400 prev_count = cluster_count;
401 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
402 BLOCK_STATS_CLUSTERS, cluster_stats);
403 } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
404 if (cluster_count < 1) {
405 row->min_space = 0;
406 row->max_nonspace = 0;
407 return 0;
408 }
409 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
410 gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
411 }
412 // get medians
413 if (testing_on) {
414 tprintf("cluster_count=%d:", cluster_count);
415 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
416 tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
417 }
418 tprintf("\n");
419 }
420
421 // Try to find proportional non-space and space for row.
422 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace;
423 gap_index++) {
424 ;
425 }
426 if (gap_index < cluster_count) {
427 lower = gaps[gap_index]; // most frequent below
428 } else {
429 if (testing_on) {
430 tprintf("No cluster below block threshold!, using default=%g\n", block->pr_nonsp);
431 }
432 lower = block->pr_nonsp;
433 }
434 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace;
435 gap_index++) {
436 ;
437 }
438 if (gap_index < cluster_count) {
439 upper = gaps[gap_index]; // most frequent above
440 } else {
441 if (testing_on) {
442 tprintf("No cluster above block threshold!, using default=%g\n", block->pr_space);
443 }
444 upper = block->pr_space;
445 }
446 row->min_space =
447 static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
448 row->max_nonspace =
449 static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
450 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
451 row->space_size = upper;
452 row->kern_size = lower;
453 if (testing_on) {
454 if (testing_row) {
455 tprintf("GAP STATS\n");
456 gap_stats.print();
457 tprintf("SPACE stats\n");
458 cluster_stats[2].print_summary();
459 tprintf("NONSPACE stats\n");
460 cluster_stats[1].print_summary();
461 }
462 tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
463 upper, row->max_nonspace, lower);
464 }
465 return 1;
466 }
467
468 /**
469 * @name make_real_words
470 *
471 * Convert a TO_BLOCK to a BLOCK.
472 */
473
474 void make_real_words(tesseract::Textord *textord,
475 TO_BLOCK *block, // block to do
476 FCOORD rotation // for drawing
477 ) {
478 TO_ROW *row; // current row
479 TO_ROW_IT row_it = block->get_rows();
480 ROW *real_row = nullptr; // output row
481 ROW_IT real_row_it = block->block->row_list();
482
483 if (row_it.empty()) {
484 return; // empty block
485 }
486 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
487 row = row_it.data();
488 if (row->blob_list()->empty() && !row->rep_words.empty()) {
489 real_row = make_rep_words(row, block);
490 } else if (!row->blob_list()->empty()) {
491 // In a fixed pitch document, some lines may be detected as fixed pitch
492 // while others don't, and will go through different path.
493 // For non-space delimited language like CJK, fixed pitch chop always
494 // leave the entire line as one word. We can force consistent chopping
495 // with force_make_prop_words flag.
496 POLY_BLOCK *pb = block->block->pdblk.poly_block();
497 if (textord_chopper_test) {
498 real_row = textord->make_blob_words(row, rotation);
499 } else if (textord_force_make_prop_words || (pb != nullptr && !pb->IsText()) ||
500 row->pitch_decision == PITCH_DEF_PROP || row->pitch_decision == PITCH_CORR_PROP) {
501 real_row = textord->make_prop_words(row, rotation);
502 } else if (row->pitch_decision == PITCH_DEF_FIXED ||
503 row->pitch_decision == PITCH_CORR_FIXED) {
504 real_row = fixed_pitch_words(row, rotation);
505 } else {
506 ASSERT_HOST(false);
507 }
508 }
509 if (real_row != nullptr) {
510 // put row in block
511 real_row_it.add_after_then_move(real_row);
512 }
513 }
514 block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size),
515 static_cast<int16_t>(block->space_size),
516 static_cast<int16_t>(block->fixed_pitch));
517 block->block->check_pitch();
518 }
519
520 /**
521 * @name make_rep_words
522 *
523 * Fabricate a real row from only the repeated blob words.
524 * Get the xheight from the block as it may be more meaningful.
525 */
526
527 ROW *make_rep_words( // make a row
528 TO_ROW *row, // row to convert
529 TO_BLOCK *block // block it lives in
530 ) {
531 ROW *real_row; // output row
532 TBOX word_box; // bounding box
533 // iterator
534 WERD_IT word_it = &row->rep_words;
535
536 if (word_it.empty()) {
537 return nullptr;
538 }
539 word_box = word_it.data()->bounding_box();
540 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
541 word_box += word_it.data()->bounding_box();
542 }
543 row->xheight = block->xheight;
544 real_row =
545 new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size));
546 word_it.set_to_list(real_row->word_list());
547 // put words in row
548 word_it.add_list_after(&row->rep_words);
549 real_row->recalc_bounding_box();
550 return real_row;
551 }
552
553 /**
554 * @name make_real_word
555 *
556 * Construct a WERD from a given number of adjacent entries in a
557 * list of BLOBNBOXs.
558 */
559
560 WERD *make_real_word(BLOBNBOX_IT *box_it, // iterator
561 int32_t blobcount, // no of blobs to use
562 bool bol, // start of line
563 uint8_t blanks // no of blanks
564 ) {
565 C_OUTLINE_IT cout_it;
566 C_BLOB_LIST cblobs;
567 C_BLOB_IT cblob_it = &cblobs;
568
569 for (int blobindex = 0; blobindex < blobcount; blobindex++) {
570 auto bblob = box_it->extract();
571 if (bblob->joined_to_prev()) {
572 auto cblob = bblob->remove_cblob();
573 if (cblob != nullptr) {
574 cout_it.set_to_list(cblob_it.data()->out_list());
575 cout_it.move_to_last();
576 cout_it.add_list_after(cblob->out_list());
577 delete cblob;
578 }
579 } else {
580 auto cblob = bblob->remove_cblob();
581 if (cblob != nullptr) {
582 cblob_it.add_after_then_move(cblob);
583 }
584 }
585 delete bblob;
586 box_it->forward(); // next one
587 }
588
589 if (blanks < 1) {
590 blanks = 1;
591 }
592
593 auto word = new WERD(&cblobs, blanks, nullptr);
594
595 if (bol) {
596 word->set_flag(W_BOL, true);
597 }
598 if (box_it->at_first()) {
599 word->set_flag(W_EOL, true); // at end of line
600 }
601
602 return word;
603 }
604
605 } // namespace tesseract