Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/wordseg.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: wordseg.cpp (Formerly wspace.c) | |
| 3 * Description: Code to segment the blobs into words. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 1992, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 // Include automatically generated configuration file if running autoconf. | |
| 20 #ifdef HAVE_CONFIG_H | |
| 21 # include "config_auto.h" | |
| 22 #endif | |
| 23 | |
| 24 #include "wordseg.h" | |
| 25 | |
| 26 #include <cmath> | |
| 27 | |
| 28 #include "blobbox.h" | |
| 29 #include "cjkpitch.h" | |
| 30 #include "drawtord.h" | |
| 31 #include "fpchop.h" | |
| 32 #include "makerow.h" | |
| 33 #include "pitsync1.h" | |
| 34 #include "statistc.h" | |
| 35 #include "textord.h" | |
| 36 #include "topitch.h" | |
| 37 #include "tovars.h" | |
| 38 | |
| 39 namespace tesseract { | |
| 40 | |
| 41 BOOL_VAR(textord_force_make_prop_words, false, "Force proportional word segmentation on all rows"); | |
| 42 BOOL_VAR(textord_chopper_test, false, "Chopper is being tested."); | |
| 43 | |
| 44 #define BLOCK_STATS_CLUSTERS 10 | |
| 45 | |
| 46 /** | |
| 47 * @name make_single_word | |
| 48 * | |
| 49 * For each row, arrange the blobs into one word. There is no fixed | |
| 50 * pitch detection. | |
| 51 */ | |
| 52 | |
| 53 void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) { | |
| 54 TO_ROW_IT to_row_it(rows); | |
| 55 ROW_IT row_it(real_rows); | |
| 56 for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) { | |
| 57 TO_ROW *row = to_row_it.data(); | |
| 58 // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready | |
| 59 // to create the word. | |
| 60 C_BLOB_LIST cblobs; | |
| 61 C_BLOB_IT cblob_it(&cblobs); | |
| 62 BLOBNBOX_IT box_it(row->blob_list()); | |
| 63 for (; !box_it.empty(); box_it.forward()) { | |
| 64 BLOBNBOX *bblob = box_it.extract(); | |
| 65 if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { | |
| 66 auto cblob = bblob->remove_cblob(); | |
| 67 if (cblob != nullptr) { | |
| 68 C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); | |
| 69 cout_it.move_to_last(); | |
| 70 cout_it.add_list_after(cblob->out_list()); | |
| 71 delete cblob; | |
| 72 } | |
| 73 } else { | |
| 74 auto cblob = bblob->remove_cblob(); | |
| 75 if (cblob != nullptr) { | |
| 76 cblob_it.add_after_then_move(cblob); | |
| 77 } | |
| 78 } | |
| 79 delete bblob; | |
| 80 } | |
| 81 // Convert the TO_ROW to a ROW. | |
| 82 ROW *real_row = | |
| 83 new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size)); | |
| 84 WERD_IT word_it(real_row->word_list()); | |
| 85 WERD *word = new WERD(&cblobs, 0, nullptr); | |
| 86 word->set_flag(W_BOL, true); | |
| 87 word->set_flag(W_EOL, true); | |
| 88 word->set_flag(W_DONT_CHOP, one_blob); | |
| 89 word_it.add_after_then_move(word); | |
| 90 real_row->recalc_bounding_box(); | |
| 91 row_it.add_after_then_move(real_row); | |
| 92 } | |
| 93 } | |
| 94 | |
| 95 /** | |
| 96 * make_words | |
| 97 * | |
| 98 * Arrange the blobs into words. | |
| 99 */ | |
| 100 void make_words(tesseract::Textord *textord, | |
| 101 ICOORD page_tr, // top right | |
| 102 float gradient, // page skew | |
| 103 BLOCK_LIST *blocks, // block list | |
| 104 TO_BLOCK_LIST *port_blocks) { // output list | |
| 105 TO_BLOCK_IT block_it; // iterator | |
| 106 TO_BLOCK *block; // current block | |
| 107 | |
| 108 if (textord->use_cjk_fp_model()) { | |
| 109 compute_fixed_pitch_cjk(page_tr, port_blocks); | |
| 110 } else { | |
| 111 compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f), | |
| 112 !bool(textord_test_landscape)); | |
| 113 } | |
| 114 textord->to_spacing(page_tr, port_blocks); | |
| 115 block_it.set_to_list(port_blocks); | |
| 116 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 117 block = block_it.data(); | |
| 118 make_real_words(textord, block, FCOORD(1.0f, 0.0f)); | |
| 119 } | |
| 120 } | |
| 121 | |
| 122 /** | |
| 123 * @name set_row_spaces | |
| 124 * | |
| 125 * Set the min_space and max_nonspace members of the row so that | |
| 126 * the blobs can be arranged into words. | |
| 127 */ | |
| 128 | |
| 129 void set_row_spaces( // find space sizes | |
| 130 TO_BLOCK *block, // block to do | |
| 131 FCOORD rotation, // for drawing | |
| 132 bool testing_on // correct orientation | |
| 133 ) { | |
| 134 TO_ROW *row; // current row | |
| 135 TO_ROW_IT row_it = block->get_rows(); | |
| 136 | |
| 137 if (row_it.empty()) { | |
| 138 return; // empty block | |
| 139 } | |
| 140 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 141 row = row_it.data(); | |
| 142 if (row->fixed_pitch == 0) { | |
| 143 row->min_space = static_cast<int32_t>( | |
| 144 ceil(row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread)); | |
| 145 row->max_nonspace = static_cast<int32_t>( | |
| 146 floor(row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread)); | |
| 147 if (testing_on && textord_show_initial_words) { | |
| 148 tprintf("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace, | |
| 149 row->min_space, row->intercept()); | |
| 150 } | |
| 151 row->space_threshold = (row->max_nonspace + row->min_space) / 2; | |
| 152 row->space_size = row->pr_space; | |
| 153 row->kern_size = row->pr_nonsp; | |
| 154 } | |
| 155 #ifndef GRAPHICS_DISABLED | |
| 156 if (textord_show_initial_words && testing_on) { | |
| 157 plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row); | |
| 158 } | |
| 159 #endif | |
| 160 } | |
| 161 } | |
| 162 | |
| 163 /** | |
| 164 * @name row_words | |
| 165 * | |
| 166 * Compute the max nonspace and min space for the row. | |
| 167 */ | |
| 168 | |
| 169 int32_t row_words( // compute space size | |
| 170 TO_BLOCK *block, // block it came from | |
| 171 TO_ROW *row, // row to operate on | |
| 172 int32_t maxwidth, // max expected space size | |
| 173 FCOORD rotation, // for drawing | |
| 174 bool testing_on // for debug | |
| 175 ) { | |
| 176 bool testing_row; // contains testpt | |
| 177 bool prev_valid; // if decent size | |
| 178 int32_t prev_x; // end of prev blob | |
| 179 int32_t cluster_count; // no of clusters | |
| 180 int32_t gap_index; // which cluster | |
| 181 int32_t smooth_factor; // for smoothing stats | |
| 182 BLOBNBOX *blob; // current blob | |
| 183 float lower, upper; // clustering parameters | |
| 184 float gaps[3]; // gap clusers | |
| 185 ICOORD testpt; | |
| 186 TBOX blob_box; // bounding box | |
| 187 // iterator | |
| 188 BLOBNBOX_IT blob_it = row->blob_list(); | |
| 189 STATS gap_stats(0, maxwidth - 1); | |
| 190 STATS cluster_stats[4]; // clusters | |
| 191 | |
| 192 testpt = ICOORD(textord_test_x, textord_test_y); | |
| 193 smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5); | |
| 194 // if (testing_on) | |
| 195 // tprintf("Row smooth factor=%d\n",smooth_factor); | |
| 196 prev_valid = false; | |
| 197 prev_x = -INT32_MAX; | |
| 198 testing_row = false; | |
| 199 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 200 blob = blob_it.data(); | |
| 201 blob_box = blob->bounding_box(); | |
| 202 if (blob_box.contains(testpt)) { | |
| 203 testing_row = true; | |
| 204 } | |
| 205 gap_stats.add(blob_box.width(), 1); | |
| 206 } | |
| 207 gap_stats.clear(); | |
| 208 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 209 blob = blob_it.data(); | |
| 210 if (!blob->joined_to_prev()) { | |
| 211 blob_box = blob->bounding_box(); | |
| 212 if (prev_valid && blob_box.left() - prev_x < maxwidth) { | |
| 213 gap_stats.add(blob_box.left() - prev_x, 1); | |
| 214 } | |
| 215 prev_valid = true; | |
| 216 prev_x = blob_box.right(); | |
| 217 } | |
| 218 } | |
| 219 if (gap_stats.get_total() == 0) { | |
| 220 row->min_space = 0; // no evidence | |
| 221 row->max_nonspace = 0; | |
| 222 return 0; | |
| 223 } | |
| 224 gap_stats.smooth(smooth_factor); | |
| 225 lower = row->xheight * textord_words_initial_lower; | |
| 226 upper = row->xheight * textord_words_initial_upper; | |
| 227 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats); | |
| 228 while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) { | |
| 229 // shrink gap | |
| 230 upper = (upper * 3 + lower) / 4; | |
| 231 lower = (lower * 3 + upper) / 4; | |
| 232 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats); | |
| 233 } | |
| 234 if (cluster_count < 2) { | |
| 235 row->min_space = 0; // no evidence | |
| 236 row->max_nonspace = 0; | |
| 237 return 0; | |
| 238 } | |
| 239 for (gap_index = 0; gap_index < cluster_count; gap_index++) { | |
| 240 gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5); | |
| 241 } | |
| 242 // get medians | |
| 243 if (cluster_count > 2) { | |
| 244 if (testing_on && textord_show_initial_words) { | |
| 245 tprintf("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept(), | |
| 246 cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5)); | |
| 247 } | |
| 248 lower = gaps[0]; | |
| 249 if (gaps[1] > lower) { | |
| 250 upper = gaps[1]; // prefer most frequent | |
| 251 if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) { | |
| 252 upper = gaps[2]; | |
| 253 } | |
| 254 } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) { | |
| 255 upper = gaps[2]; | |
| 256 } else if (lower >= block->xheight * textord_words_min_minspace) { | |
| 257 upper = lower; // not nice | |
| 258 lower = gaps[1]; | |
| 259 if (testing_on && textord_show_initial_words) { | |
| 260 tprintf("Had to switch most common from lower to upper!!\n"); | |
| 261 gap_stats.print(); | |
| 262 } | |
| 263 } else { | |
| 264 row->min_space = 0; // no evidence | |
| 265 row->max_nonspace = 0; | |
| 266 return 0; | |
| 267 } | |
| 268 } else { | |
| 269 if (gaps[1] < gaps[0]) { | |
| 270 if (testing_on && textord_show_initial_words) { | |
| 271 tprintf("Had to switch most common from lower to upper!!\n"); | |
| 272 gap_stats.print(); | |
| 273 } | |
| 274 lower = gaps[1]; | |
| 275 upper = gaps[0]; | |
| 276 } else { | |
| 277 upper = gaps[1]; | |
| 278 lower = gaps[0]; | |
| 279 } | |
| 280 } | |
| 281 if (upper < block->xheight * textord_words_min_minspace) { | |
| 282 row->min_space = 0; // no evidence | |
| 283 row->max_nonspace = 0; | |
| 284 return 0; | |
| 285 } | |
| 286 if (upper * 3 < block->min_space * 2 + block->max_nonspace || | |
| 287 lower * 3 > block->min_space * 2 + block->max_nonspace) { | |
| 288 if (testing_on && textord_show_initial_words) { | |
| 289 tprintf("Disagreement between block and row at %g!!\n", row->intercept()); | |
| 290 tprintf("Lower=%g, upper=%g, Stats:\n", lower, upper); | |
| 291 gap_stats.print(); | |
| 292 } | |
| 293 } | |
| 294 row->min_space = | |
| 295 static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread)); | |
| 296 row->max_nonspace = | |
| 297 static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread)); | |
| 298 row->space_threshold = (row->max_nonspace + row->min_space) / 2; | |
| 299 row->space_size = upper; | |
| 300 row->kern_size = lower; | |
| 301 if (testing_on && textord_show_initial_words) { | |
| 302 if (testing_row) { | |
| 303 tprintf("GAP STATS\n"); | |
| 304 gap_stats.print(); | |
| 305 tprintf("SPACE stats\n"); | |
| 306 cluster_stats[2].print_summary(); | |
| 307 tprintf("NONSPACE stats\n"); | |
| 308 cluster_stats[1].print_summary(); | |
| 309 } | |
| 310 tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space, | |
| 311 upper, row->max_nonspace, lower); | |
| 312 } | |
| 313 return cluster_stats[2].get_total(); | |
| 314 } | |
| 315 | |
| 316 /** | |
| 317 * @name row_words2 | |
| 318 * | |
| 319 * Compute the max nonspace and min space for the row. | |
| 320 */ | |
| 321 | |
| 322 int32_t row_words2( // compute space size | |
| 323 TO_BLOCK *block, // block it came from | |
| 324 TO_ROW *row, // row to operate on | |
| 325 int32_t maxwidth, // max expected space size | |
| 326 FCOORD rotation, // for drawing | |
| 327 bool testing_on // for debug | |
| 328 ) { | |
| 329 bool prev_valid; // if decent size | |
| 330 bool this_valid; // current blob big enough | |
| 331 int32_t prev_x; // end of prev blob | |
| 332 int32_t min_width; // min interesting width | |
| 333 int32_t valid_count; // good gaps | |
| 334 int32_t total_count; // total gaps | |
| 335 int32_t cluster_count; // no of clusters | |
| 336 int32_t prev_count; // previous cluster_count | |
| 337 int32_t gap_index; // which cluster | |
| 338 int32_t smooth_factor; // for smoothing stats | |
| 339 BLOBNBOX *blob; // current blob | |
| 340 float lower, upper; // clustering parameters | |
| 341 ICOORD testpt; | |
| 342 TBOX blob_box; // bounding box | |
| 343 // iterator | |
| 344 BLOBNBOX_IT blob_it = row->blob_list(); | |
| 345 STATS gap_stats(0, maxwidth - 1); | |
| 346 // gap sizes | |
| 347 float gaps[BLOCK_STATS_CLUSTERS]; | |
| 348 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; | |
| 349 // clusters | |
| 350 | |
| 351 testpt = ICOORD(textord_test_x, textord_test_y); | |
| 352 smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5); | |
| 353 // if (testing_on) | |
| 354 // tprintf("Row smooth factor=%d\n",smooth_factor); | |
| 355 prev_valid = false; | |
| 356 prev_x = -INT16_MAX; | |
| 357 const bool testing_row = false; | |
| 358 // min blob size | |
| 359 min_width = static_cast<int32_t>(block->pr_space); | |
| 360 total_count = 0; | |
| 361 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 362 blob = blob_it.data(); | |
| 363 if (!blob->joined_to_prev()) { | |
| 364 blob_box = blob->bounding_box(); | |
| 365 this_valid = blob_box.width() >= min_width; | |
| 366 if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) { | |
| 367 gap_stats.add(blob_box.left() - prev_x, 1); | |
| 368 } | |
| 369 total_count++; // count possibles | |
| 370 prev_x = blob_box.right(); | |
| 371 prev_valid = this_valid; | |
| 372 } | |
| 373 } | |
| 374 valid_count = gap_stats.get_total(); | |
| 375 if (valid_count < total_count * textord_words_minlarge) { | |
| 376 gap_stats.clear(); | |
| 377 prev_x = -INT16_MAX; | |
| 378 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 379 blob = blob_it.data(); | |
| 380 if (!blob->joined_to_prev()) { | |
| 381 blob_box = blob->bounding_box(); | |
| 382 if (blob_box.left() - prev_x < maxwidth) { | |
| 383 gap_stats.add(blob_box.left() - prev_x, 1); | |
| 384 } | |
| 385 prev_x = blob_box.right(); | |
| 386 } | |
| 387 } | |
| 388 } | |
| 389 if (gap_stats.get_total() == 0) { | |
| 390 row->min_space = 0; // no evidence | |
| 391 row->max_nonspace = 0; | |
| 392 return 0; | |
| 393 } | |
| 394 | |
| 395 cluster_count = 0; | |
| 396 lower = block->xheight * words_initial_lower; | |
| 397 upper = block->xheight * words_initial_upper; | |
| 398 gap_stats.smooth(smooth_factor); | |
| 399 do { | |
| 400 prev_count = cluster_count; | |
| 401 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, | |
| 402 BLOCK_STATS_CLUSTERS, cluster_stats); | |
| 403 } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); | |
| 404 if (cluster_count < 1) { | |
| 405 row->min_space = 0; | |
| 406 row->max_nonspace = 0; | |
| 407 return 0; | |
| 408 } | |
| 409 for (gap_index = 0; gap_index < cluster_count; gap_index++) { | |
| 410 gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5); | |
| 411 } | |
| 412 // get medians | |
| 413 if (testing_on) { | |
| 414 tprintf("cluster_count=%d:", cluster_count); | |
| 415 for (gap_index = 0; gap_index < cluster_count; gap_index++) { | |
| 416 tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total()); | |
| 417 } | |
| 418 tprintf("\n"); | |
| 419 } | |
| 420 | |
| 421 // Try to find proportional non-space and space for row. | |
| 422 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace; | |
| 423 gap_index++) { | |
| 424 ; | |
| 425 } | |
| 426 if (gap_index < cluster_count) { | |
| 427 lower = gaps[gap_index]; // most frequent below | |
| 428 } else { | |
| 429 if (testing_on) { | |
| 430 tprintf("No cluster below block threshold!, using default=%g\n", block->pr_nonsp); | |
| 431 } | |
| 432 lower = block->pr_nonsp; | |
| 433 } | |
| 434 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace; | |
| 435 gap_index++) { | |
| 436 ; | |
| 437 } | |
| 438 if (gap_index < cluster_count) { | |
| 439 upper = gaps[gap_index]; // most frequent above | |
| 440 } else { | |
| 441 if (testing_on) { | |
| 442 tprintf("No cluster above block threshold!, using default=%g\n", block->pr_space); | |
| 443 } | |
| 444 upper = block->pr_space; | |
| 445 } | |
| 446 row->min_space = | |
| 447 static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread)); | |
| 448 row->max_nonspace = | |
| 449 static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread)); | |
| 450 row->space_threshold = (row->max_nonspace + row->min_space) / 2; | |
| 451 row->space_size = upper; | |
| 452 row->kern_size = lower; | |
| 453 if (testing_on) { | |
| 454 if (testing_row) { | |
| 455 tprintf("GAP STATS\n"); | |
| 456 gap_stats.print(); | |
| 457 tprintf("SPACE stats\n"); | |
| 458 cluster_stats[2].print_summary(); | |
| 459 tprintf("NONSPACE stats\n"); | |
| 460 cluster_stats[1].print_summary(); | |
| 461 } | |
| 462 tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space, | |
| 463 upper, row->max_nonspace, lower); | |
| 464 } | |
| 465 return 1; | |
| 466 } | |
| 467 | |
| 468 /** | |
| 469 * @name make_real_words | |
| 470 * | |
| 471 * Convert a TO_BLOCK to a BLOCK. | |
| 472 */ | |
| 473 | |
| 474 void make_real_words(tesseract::Textord *textord, | |
| 475 TO_BLOCK *block, // block to do | |
| 476 FCOORD rotation // for drawing | |
| 477 ) { | |
| 478 TO_ROW *row; // current row | |
| 479 TO_ROW_IT row_it = block->get_rows(); | |
| 480 ROW *real_row = nullptr; // output row | |
| 481 ROW_IT real_row_it = block->block->row_list(); | |
| 482 | |
| 483 if (row_it.empty()) { | |
| 484 return; // empty block | |
| 485 } | |
| 486 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 487 row = row_it.data(); | |
| 488 if (row->blob_list()->empty() && !row->rep_words.empty()) { | |
| 489 real_row = make_rep_words(row, block); | |
| 490 } else if (!row->blob_list()->empty()) { | |
| 491 // In a fixed pitch document, some lines may be detected as fixed pitch | |
| 492 // while others don't, and will go through different path. | |
| 493 // For non-space delimited language like CJK, fixed pitch chop always | |
| 494 // leave the entire line as one word. We can force consistent chopping | |
| 495 // with force_make_prop_words flag. | |
| 496 POLY_BLOCK *pb = block->block->pdblk.poly_block(); | |
| 497 if (textord_chopper_test) { | |
| 498 real_row = textord->make_blob_words(row, rotation); | |
| 499 } else if (textord_force_make_prop_words || (pb != nullptr && !pb->IsText()) || | |
| 500 row->pitch_decision == PITCH_DEF_PROP || row->pitch_decision == PITCH_CORR_PROP) { | |
| 501 real_row = textord->make_prop_words(row, rotation); | |
| 502 } else if (row->pitch_decision == PITCH_DEF_FIXED || | |
| 503 row->pitch_decision == PITCH_CORR_FIXED) { | |
| 504 real_row = fixed_pitch_words(row, rotation); | |
| 505 } else { | |
| 506 ASSERT_HOST(false); | |
| 507 } | |
| 508 } | |
| 509 if (real_row != nullptr) { | |
| 510 // put row in block | |
| 511 real_row_it.add_after_then_move(real_row); | |
| 512 } | |
| 513 } | |
| 514 block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size), | |
| 515 static_cast<int16_t>(block->space_size), | |
| 516 static_cast<int16_t>(block->fixed_pitch)); | |
| 517 block->block->check_pitch(); | |
| 518 } | |
| 519 | |
| 520 /** | |
| 521 * @name make_rep_words | |
| 522 * | |
| 523 * Fabricate a real row from only the repeated blob words. | |
| 524 * Get the xheight from the block as it may be more meaningful. | |
| 525 */ | |
| 526 | |
| 527 ROW *make_rep_words( // make a row | |
| 528 TO_ROW *row, // row to convert | |
| 529 TO_BLOCK *block // block it lives in | |
| 530 ) { | |
| 531 ROW *real_row; // output row | |
| 532 TBOX word_box; // bounding box | |
| 533 // iterator | |
| 534 WERD_IT word_it = &row->rep_words; | |
| 535 | |
| 536 if (word_it.empty()) { | |
| 537 return nullptr; | |
| 538 } | |
| 539 word_box = word_it.data()->bounding_box(); | |
| 540 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 541 word_box += word_it.data()->bounding_box(); | |
| 542 } | |
| 543 row->xheight = block->xheight; | |
| 544 real_row = | |
| 545 new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size)); | |
| 546 word_it.set_to_list(real_row->word_list()); | |
| 547 // put words in row | |
| 548 word_it.add_list_after(&row->rep_words); | |
| 549 real_row->recalc_bounding_box(); | |
| 550 return real_row; | |
| 551 } | |
| 552 | |
| 553 /** | |
| 554 * @name make_real_word | |
| 555 * | |
| 556 * Construct a WERD from a given number of adjacent entries in a | |
| 557 * list of BLOBNBOXs. | |
| 558 */ | |
| 559 | |
| 560 WERD *make_real_word(BLOBNBOX_IT *box_it, // iterator | |
| 561 int32_t blobcount, // no of blobs to use | |
| 562 bool bol, // start of line | |
| 563 uint8_t blanks // no of blanks | |
| 564 ) { | |
| 565 C_OUTLINE_IT cout_it; | |
| 566 C_BLOB_LIST cblobs; | |
| 567 C_BLOB_IT cblob_it = &cblobs; | |
| 568 | |
| 569 for (int blobindex = 0; blobindex < blobcount; blobindex++) { | |
| 570 auto bblob = box_it->extract(); | |
| 571 if (bblob->joined_to_prev()) { | |
| 572 auto cblob = bblob->remove_cblob(); | |
| 573 if (cblob != nullptr) { | |
| 574 cout_it.set_to_list(cblob_it.data()->out_list()); | |
| 575 cout_it.move_to_last(); | |
| 576 cout_it.add_list_after(cblob->out_list()); | |
| 577 delete cblob; | |
| 578 } | |
| 579 } else { | |
| 580 auto cblob = bblob->remove_cblob(); | |
| 581 if (cblob != nullptr) { | |
| 582 cblob_it.add_after_then_move(cblob); | |
| 583 } | |
| 584 } | |
| 585 delete bblob; | |
| 586 box_it->forward(); // next one | |
| 587 } | |
| 588 | |
| 589 if (blanks < 1) { | |
| 590 blanks = 1; | |
| 591 } | |
| 592 | |
| 593 auto word = new WERD(&cblobs, blanks, nullptr); | |
| 594 | |
| 595 if (bol) { | |
| 596 word->set_flag(W_BOL, true); | |
| 597 } | |
| 598 if (box_it->at_first()) { | |
| 599 word->set_flag(W_EOL, true); // at end of line | |
| 600 } | |
| 601 | |
| 602 return word; | |
| 603 } | |
| 604 | |
| 605 } // namespace tesseract |
