Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/ocrblock.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: ocrblock.cpp (Formerly block.c) | |
| 3 * Description: BLOCK member functions and iterator functions. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 1991, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #include "ocrblock.h" | |
| 20 | |
| 21 #include "stepblob.h" | |
| 22 #include "tprintf.h" | |
| 23 | |
| 24 #include <cstdlib> | |
| 25 #include <memory> // std::unique_ptr | |
| 26 | |
| 27 namespace tesseract { | |
| 28 | |
| 29 /** | |
| 30 * BLOCK::BLOCK | |
| 31 * | |
| 32 * Constructor for a simple rectangular block. | |
| 33 */ | |
| 34 BLOCK::BLOCK(const char *name, ///< filename | |
| 35 bool prop, ///< proportional | |
| 36 int16_t kern, ///< kerning | |
| 37 int16_t space, ///< spacing | |
| 38 TDimension xmin, ///< bottom left | |
| 39 TDimension ymin, | |
| 40 TDimension xmax, ///< top right | |
| 41 TDimension ymax) | |
| 42 : pdblk(xmin, ymin, xmax, ymax) | |
| 43 , filename(name) | |
| 44 , re_rotation_(1.0f, 0.0f) | |
| 45 , classify_rotation_(1.0f, 0.0f) | |
| 46 , skew_(1.0f, 0.0f) { | |
| 47 ICOORDELT_IT left_it = &pdblk.leftside; | |
| 48 ICOORDELT_IT right_it = &pdblk.rightside; | |
| 49 | |
| 50 proportional = prop; | |
| 51 kerning = kern; | |
| 52 spacing = space; | |
| 53 font_class = -1; // not assigned | |
| 54 cell_over_xheight_ = 2.0f; | |
| 55 pdblk.hand_poly = nullptr; | |
| 56 left_it.set_to_list(&pdblk.leftside); | |
| 57 right_it.set_to_list(&pdblk.rightside); | |
| 58 // make default box | |
| 59 left_it.add_to_end(new ICOORDELT(xmin, ymin)); | |
| 60 left_it.add_to_end(new ICOORDELT(xmin, ymax)); | |
| 61 right_it.add_to_end(new ICOORDELT(xmax, ymin)); | |
| 62 right_it.add_to_end(new ICOORDELT(xmax, ymax)); | |
| 63 } | |
| 64 | |
| 65 /** | |
| 66 * decreasing_top_order | |
| 67 * | |
| 68 * Sort Comparator: Return <0 if row1 top < row2 top | |
| 69 */ | |
| 70 | |
| 71 static int decreasing_top_order(const void *row1, const void *row2) { | |
| 72 return (*reinterpret_cast<ROW *const *>(row2))->bounding_box().top() - | |
| 73 (*reinterpret_cast<ROW *const *>(row1))->bounding_box().top(); | |
| 74 } | |
| 75 | |
| 76 /** | |
| 77 * BLOCK::rotate | |
| 78 * | |
| 79 * Rotate the polygon by the given rotation and recompute the bounding_box. | |
| 80 */ | |
| 81 void BLOCK::rotate(const FCOORD &rotation) { | |
| 82 pdblk.poly_block()->rotate(rotation); | |
| 83 pdblk.box = *pdblk.poly_block()->bounding_box(); | |
| 84 } | |
| 85 | |
| 86 // Returns the bounding box including the desired combination of upper and | |
| 87 // lower noise/diacritic elements. | |
| 88 TBOX BLOCK::restricted_bounding_box(bool upper_dots, bool lower_dots) const { | |
| 89 TBOX box; | |
| 90 // This is a read-only iteration of the rows in the block. | |
| 91 ROW_IT it(const_cast<ROW_LIST *>(&rows)); | |
| 92 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 93 box += it.data()->restricted_bounding_box(upper_dots, lower_dots); | |
| 94 } | |
| 95 return box; | |
| 96 } | |
| 97 | |
| 98 /** | |
| 99 * BLOCK::reflect_polygon_in_y_axis | |
| 100 * | |
| 101 * Reflects the polygon in the y-axis and recompute the bounding_box. | |
| 102 * Does nothing to any contained rows/words/blobs etc. | |
| 103 */ | |
| 104 void BLOCK::reflect_polygon_in_y_axis() { | |
| 105 pdblk.poly_block()->reflect_in_y_axis(); | |
| 106 pdblk.box = *pdblk.poly_block()->bounding_box(); | |
| 107 } | |
| 108 | |
| 109 /** | |
| 110 * BLOCK::sort_rows | |
| 111 * | |
| 112 * Order rows so that they are in order of decreasing Y coordinate | |
| 113 */ | |
| 114 | |
| 115 void BLOCK::sort_rows() { // order on "top" | |
| 116 ROW_IT row_it(&rows); | |
| 117 | |
| 118 row_it.sort(decreasing_top_order); | |
| 119 } | |
| 120 | |
| 121 /** | |
| 122 * BLOCK::compress | |
| 123 * | |
| 124 * Delete space between the rows. (And maybe one day, compress the rows) | |
| 125 * Fill space of block from top down, left aligning rows. | |
| 126 */ | |
| 127 | |
| 128 void BLOCK::compress() { // squash it up | |
| 129 #define ROW_SPACING 5 | |
| 130 | |
| 131 ROW_IT row_it(&rows); | |
| 132 ROW *row; | |
| 133 ICOORD row_spacing(0, ROW_SPACING); | |
| 134 | |
| 135 ICOORDELT_IT icoordelt_it; | |
| 136 | |
| 137 sort_rows(); | |
| 138 | |
| 139 pdblk.box = TBOX(pdblk.box.topleft(), pdblk.box.topleft()); | |
| 140 pdblk.box.move_bottom_edge(ROW_SPACING); | |
| 141 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 142 row = row_it.data(); | |
| 143 row->move(pdblk.box.botleft() - row_spacing - row->bounding_box().topleft()); | |
| 144 pdblk.box += row->bounding_box(); | |
| 145 } | |
| 146 | |
| 147 pdblk.leftside.clear(); | |
| 148 icoordelt_it.set_to_list(&pdblk.leftside); | |
| 149 icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.left(), pdblk.box.bottom())); | |
| 150 icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.left(), pdblk.box.top())); | |
| 151 pdblk.rightside.clear(); | |
| 152 icoordelt_it.set_to_list(&pdblk.rightside); | |
| 153 icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.right(), pdblk.box.bottom())); | |
| 154 icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.right(), pdblk.box.top())); | |
| 155 } | |
| 156 | |
| 157 /** | |
| 158 * BLOCK::check_pitch | |
| 159 * | |
| 160 * Check whether the block is fixed or prop, set the flag, and set | |
| 161 * the pitch if it is fixed. | |
| 162 */ | |
| 163 | |
| 164 void BLOCK::check_pitch() { // check prop | |
| 165 // tprintf("Missing FFT fixed pitch stuff!\n"); | |
| 166 pitch = -1; | |
| 167 } | |
| 168 | |
| 169 /** | |
| 170 * BLOCK::compress | |
| 171 * | |
| 172 * Compress and move in a single operation. | |
| 173 */ | |
| 174 | |
| 175 void BLOCK::compress( // squash it up | |
| 176 const ICOORD vec // and move | |
| 177 ) { | |
| 178 pdblk.box.move(vec); | |
| 179 compress(); | |
| 180 } | |
| 181 | |
| 182 /** | |
| 183 * BLOCK::print | |
| 184 * | |
| 185 * Print the info on a block | |
| 186 */ | |
| 187 | |
| 188 void BLOCK::print( // print list of sides | |
| 189 FILE *, ///< file to print on | |
| 190 bool dump ///< print full detail | |
| 191 ) { | |
| 192 ICOORDELT_IT it = &pdblk.leftside; // iterator | |
| 193 | |
| 194 pdblk.box.print(); | |
| 195 tprintf("Proportional= %s\n", proportional ? "TRUE" : "FALSE"); | |
| 196 tprintf("Kerning= %d\n", kerning); | |
| 197 tprintf("Spacing= %d\n", spacing); | |
| 198 tprintf("Fixed_pitch=%d\n", pitch); | |
| 199 tprintf("Filename= %s\n", filename.c_str()); | |
| 200 | |
| 201 if (dump) { | |
| 202 tprintf("Left side coords are:\n"); | |
| 203 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 204 tprintf("(%d,%d) ", it.data()->x(), it.data()->y()); | |
| 205 } | |
| 206 tprintf("\n"); | |
| 207 tprintf("Right side coords are:\n"); | |
| 208 it.set_to_list(&pdblk.rightside); | |
| 209 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 210 tprintf("(%d,%d) ", it.data()->x(), it.data()->y()); | |
| 211 } | |
| 212 tprintf("\n"); | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 /** | |
| 217 * BLOCK::operator= | |
| 218 * | |
| 219 * Assignment - duplicate the block structure, but with an EMPTY row list. | |
| 220 */ | |
| 221 | |
| 222 BLOCK &BLOCK::operator=( // assignment | |
| 223 const BLOCK &source // from this | |
| 224 ) { | |
| 225 this->ELIST_LINK::operator=(source); | |
| 226 pdblk = source.pdblk; | |
| 227 proportional = source.proportional; | |
| 228 kerning = source.kerning; | |
| 229 spacing = source.spacing; | |
| 230 filename = source.filename; // STRINGs assign ok | |
| 231 if (!rows.empty()) { | |
| 232 rows.clear(); | |
| 233 } | |
| 234 re_rotation_ = source.re_rotation_; | |
| 235 classify_rotation_ = source.classify_rotation_; | |
| 236 skew_ = source.skew_; | |
| 237 return *this; | |
| 238 } | |
| 239 | |
| 240 // This function is for finding the approximate (horizontal) distance from | |
| 241 // the x-coordinate of the left edge of a symbol to the left edge of the | |
| 242 // text block which contains it. We are passed: | |
| 243 // segments - output of PB_LINE_IT::get_line() which contains x-coordinate | |
| 244 // intervals for the scan line going through the symbol's y-coordinate. | |
| 245 // Each element of segments is of the form (x()=start_x, y()=length). | |
| 246 // x - the x coordinate of the symbol we're interested in. | |
| 247 // margin - return value, the distance from x,y to the left margin of the | |
| 248 // block containing it. | |
| 249 // If all segments were to the right of x, we return false and 0. | |
| 250 static bool LeftMargin(ICOORDELT_LIST *segments, int x, int *margin) { | |
| 251 bool found = false; | |
| 252 *margin = 0; | |
| 253 if (segments->empty()) { | |
| 254 return found; | |
| 255 } | |
| 256 ICOORDELT_IT seg_it(segments); | |
| 257 for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { | |
| 258 int cur_margin = x - seg_it.data()->x(); | |
| 259 if (cur_margin >= 0) { | |
| 260 if (!found) { | |
| 261 *margin = cur_margin; | |
| 262 } else if (cur_margin < *margin) { | |
| 263 *margin = cur_margin; | |
| 264 } | |
| 265 found = true; | |
| 266 } | |
| 267 } | |
| 268 return found; | |
| 269 } | |
| 270 | |
| 271 // This function is for finding the approximate (horizontal) distance from | |
| 272 // the x-coordinate of the right edge of a symbol to the right edge of the | |
| 273 // text block which contains it. We are passed: | |
| 274 // segments - output of PB_LINE_IT::get_line() which contains x-coordinate | |
| 275 // intervals for the scan line going through the symbol's y-coordinate. | |
| 276 // Each element of segments is of the form (x()=start_x, y()=length). | |
| 277 // x - the x coordinate of the symbol we're interested in. | |
| 278 // margin - return value, the distance from x,y to the right margin of the | |
| 279 // block containing it. | |
| 280 // If all segments were to the left of x, we return false and 0. | |
| 281 static bool RightMargin(ICOORDELT_LIST *segments, int x, int *margin) { | |
| 282 bool found = false; | |
| 283 *margin = 0; | |
| 284 if (segments->empty()) { | |
| 285 return found; | |
| 286 } | |
| 287 ICOORDELT_IT seg_it(segments); | |
| 288 for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { | |
| 289 int cur_margin = seg_it.data()->x() + seg_it.data()->y() - x; | |
| 290 if (cur_margin >= 0) { | |
| 291 if (!found) { | |
| 292 *margin = cur_margin; | |
| 293 } else if (cur_margin < *margin) { | |
| 294 *margin = cur_margin; | |
| 295 } | |
| 296 found = true; | |
| 297 } | |
| 298 } | |
| 299 return found; | |
| 300 } | |
| 301 | |
| 302 // Compute the distance from the left and right ends of each row to the | |
| 303 // left and right edges of the block's polyblock. Illustration: | |
| 304 // ____________________________ _______________________ | |
| 305 // | Howdy neighbor! | |rectangular blocks look| | |
| 306 // | This text is written to| |more like stacked pizza| | |
| 307 // |illustrate how useful poly- |boxes. | | |
| 308 // |blobs are in ----------- ------ The polyblob| | |
| 309 // |dealing with| _________ |for a BLOCK rec-| | |
| 310 // |harder layout| /===========\ |ords the possibly| | |
| 311 // |issues. | | _ _ | |skewed pseudo-| | |
| 312 // | You see this| | |_| \|_| | |rectangular | | |
| 313 // |text is flowed| | } | |boundary that| | |
| 314 // |around a mid-| \ ____ | |forms the ideal-| | |
| 315 // |column portrait._____ \ / __|ized text margin| | |
| 316 // | Polyblobs exist| \ / |from which we should| | |
| 317 // |to account for insets| | | |measure paragraph| | |
| 318 // |which make otherwise| ----- |indentation. | | |
| 319 // ----------------------- ---------------------- | |
| 320 // | |
| 321 // If we identify a drop-cap, we measure the left margin for the lines | |
| 322 // below the first line relative to one space past the drop cap. The | |
| 323 // first line's margin and those past the drop cap area are measured | |
| 324 // relative to the enclosing polyblock. | |
| 325 // | |
| 326 // TODO(rays): Before this will work well, we'll need to adjust the | |
| 327 // polyblob tighter around the text near images, as in: | |
| 328 // UNLV_AUTO:mag.3G0 page 2 | |
| 329 // UNLV_AUTO:mag.3G4 page 16 | |
| 330 void BLOCK::compute_row_margins() { | |
| 331 if (row_list()->empty() || row_list()->singleton()) { | |
| 332 return; | |
| 333 } | |
| 334 | |
| 335 // If Layout analysis was not called, default to this. | |
| 336 POLY_BLOCK rect_block(pdblk.bounding_box(), PT_FLOWING_TEXT); | |
| 337 POLY_BLOCK *pblock = &rect_block; | |
| 338 if (pdblk.poly_block() != nullptr) { | |
| 339 pblock = pdblk.poly_block(); | |
| 340 } | |
| 341 | |
| 342 // Step One: Determine if there is a drop-cap. | |
| 343 // TODO(eger): Fix up drop cap code for RTL languages. | |
| 344 ROW_IT r_it(row_list()); | |
| 345 ROW *first_row = r_it.data(); | |
| 346 ROW *second_row = r_it.data_relative(1); | |
| 347 | |
| 348 // initialize the bottom of a fictitious drop cap far above the first line. | |
| 349 int drop_cap_bottom = first_row->bounding_box().top() + first_row->bounding_box().height(); | |
| 350 int drop_cap_right = first_row->bounding_box().left(); | |
| 351 int mid_second_line = second_row->bounding_box().top() - second_row->bounding_box().height() / 2; | |
| 352 WERD_IT werd_it(r_it.data()->word_list()); // words of line one | |
| 353 if (!werd_it.empty()) { | |
| 354 C_BLOB_IT cblob_it(werd_it.data()->cblob_list()); | |
| 355 for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) { | |
| 356 TBOX bbox = cblob_it.data()->bounding_box(); | |
| 357 if (bbox.bottom() <= mid_second_line) { | |
| 358 // we found a real drop cap | |
| 359 first_row->set_has_drop_cap(true); | |
| 360 if (drop_cap_bottom > bbox.bottom()) { | |
| 361 drop_cap_bottom = bbox.bottom(); | |
| 362 } | |
| 363 if (drop_cap_right < bbox.right()) { | |
| 364 drop_cap_right = bbox.right(); | |
| 365 } | |
| 366 } | |
| 367 } | |
| 368 } | |
| 369 | |
| 370 // Step Two: Calculate the margin from the text of each row to the block | |
| 371 // (or drop-cap) boundaries. | |
| 372 PB_LINE_IT lines(pblock); | |
| 373 r_it.set_to_list(row_list()); | |
| 374 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { | |
| 375 ROW *row = r_it.data(); | |
| 376 TBOX row_box = row->bounding_box(); | |
| 377 int left_y = row->base_line(row_box.left()) + row->x_height(); | |
| 378 int left_margin; | |
| 379 const std::unique_ptr</*non-const*/ ICOORDELT_LIST> segments_left(lines.get_line(left_y)); | |
| 380 LeftMargin(segments_left.get(), row_box.left(), &left_margin); | |
| 381 | |
| 382 if (row_box.top() >= drop_cap_bottom) { | |
| 383 int drop_cap_distance = row_box.left() - row->space() - drop_cap_right; | |
| 384 if (drop_cap_distance < 0) { | |
| 385 drop_cap_distance = 0; | |
| 386 } | |
| 387 if (drop_cap_distance < left_margin) { | |
| 388 left_margin = drop_cap_distance; | |
| 389 } | |
| 390 } | |
| 391 | |
| 392 int right_y = row->base_line(row_box.right()) + row->x_height(); | |
| 393 int right_margin; | |
| 394 const std::unique_ptr</*non-const*/ ICOORDELT_LIST> segments_right(lines.get_line(right_y)); | |
| 395 RightMargin(segments_right.get(), row_box.right(), &right_margin); | |
| 396 row->set_lmargin(left_margin); | |
| 397 row->set_rmargin(right_margin); | |
| 398 } | |
| 399 } | |
| 400 | |
| 401 /********************************************************************** | |
| 402 * PrintSegmentationStats | |
| 403 * | |
| 404 * Prints segmentation stats for the given block list. | |
| 405 **********************************************************************/ | |
| 406 | |
| 407 void PrintSegmentationStats(BLOCK_LIST *block_list) { | |
| 408 int num_blocks = 0; | |
| 409 int num_rows = 0; | |
| 410 int num_words = 0; | |
| 411 int num_blobs = 0; | |
| 412 BLOCK_IT block_it(block_list); | |
| 413 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 414 BLOCK *block = block_it.data(); | |
| 415 ++num_blocks; | |
| 416 ROW_IT row_it(block->row_list()); | |
| 417 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 418 ++num_rows; | |
| 419 ROW *row = row_it.data(); | |
| 420 // Iterate over all werds in the row. | |
| 421 WERD_IT werd_it(row->word_list()); | |
| 422 for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { | |
| 423 WERD *werd = werd_it.data(); | |
| 424 ++num_words; | |
| 425 num_blobs += werd->cblob_list()->length(); | |
| 426 } | |
| 427 } | |
| 428 } | |
| 429 tprintf("Block list stats:\nBlocks = %d\nRows = %d\nWords = %d\nBlobs = %d\n", num_blocks, | |
| 430 num_rows, num_words, num_blobs); | |
| 431 } | |
| 432 | |
| 433 /********************************************************************** | |
| 434 * ExtractBlobsFromSegmentation | |
| 435 * | |
| 436 * Extracts blobs from the given block list and adds them to the output list. | |
| 437 * The block list must have been created by performing a page segmentation. | |
| 438 **********************************************************************/ | |
| 439 | |
| 440 void ExtractBlobsFromSegmentation(BLOCK_LIST *blocks, C_BLOB_LIST *output_blob_list) { | |
| 441 C_BLOB_IT return_list_it(output_blob_list); | |
| 442 BLOCK_IT block_it(blocks); | |
| 443 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 444 BLOCK *block = block_it.data(); | |
| 445 ROW_IT row_it(block->row_list()); | |
| 446 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 447 ROW *row = row_it.data(); | |
| 448 // Iterate over all werds in the row. | |
| 449 WERD_IT werd_it(row->word_list()); | |
| 450 for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { | |
| 451 WERD *werd = werd_it.data(); | |
| 452 return_list_it.move_to_last(); | |
| 453 return_list_it.add_list_after(werd->cblob_list()); | |
| 454 return_list_it.move_to_last(); | |
| 455 return_list_it.add_list_after(werd->rej_cblob_list()); | |
| 456 } | |
| 457 } | |
| 458 } | |
| 459 } | |
| 460 | |
| 461 /********************************************************************** | |
| 462 * RefreshWordBlobsFromNewBlobs() | |
| 463 * | |
| 464 * Refreshes the words in the block_list by using blobs in the | |
| 465 * new_blobs list. | |
| 466 * Block list must have word segmentation in it. | |
| 467 * It consumes the blobs provided in the new_blobs list. The blobs leftover in | |
| 468 * the new_blobs list after the call weren't matched to any blobs of the words | |
| 469 * in block list. | |
| 470 * The output not_found_blobs is a list of blobs from the original segmentation | |
| 471 * in the block_list for which no corresponding new blobs were found. | |
| 472 **********************************************************************/ | |
| 473 | |
| 474 void RefreshWordBlobsFromNewBlobs(BLOCK_LIST *block_list, C_BLOB_LIST *new_blobs, | |
| 475 C_BLOB_LIST *not_found_blobs) { | |
| 476 // Now iterate over all the blobs in the segmentation_block_list_, and just | |
| 477 // replace the corresponding c-blobs inside the werds. | |
| 478 BLOCK_IT block_it(block_list); | |
| 479 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 480 BLOCK *block = block_it.data(); | |
| 481 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { | |
| 482 continue; // Don't touch non-text blocks. | |
| 483 } | |
| 484 // Iterate over all rows in the block. | |
| 485 ROW_IT row_it(block->row_list()); | |
| 486 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 487 ROW *row = row_it.data(); | |
| 488 // Iterate over all werds in the row. | |
| 489 WERD_IT werd_it(row->word_list()); | |
| 490 WERD_LIST new_words; | |
| 491 WERD_IT new_words_it(&new_words); | |
| 492 for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { | |
| 493 WERD *werd = werd_it.extract(); | |
| 494 WERD *new_werd = werd->ConstructWerdWithNewBlobs(new_blobs, not_found_blobs); | |
| 495 if (new_werd) { | |
| 496 // Insert this new werd into the actual row's werd-list. Remove the | |
| 497 // existing one. | |
| 498 new_words_it.add_after_then_move(new_werd); | |
| 499 delete werd; | |
| 500 } else { | |
| 501 // Reinsert the older word back, for lack of better options. | |
| 502 // This is critical since dropping the words messes up segmentation: | |
| 503 // eg. 1st word in the row might otherwise have W_FUZZY_NON turned on. | |
| 504 new_words_it.add_after_then_move(werd); | |
| 505 } | |
| 506 } | |
| 507 // Get rid of the old word list & replace it with the new one. | |
| 508 row->word_list()->clear(); | |
| 509 werd_it.move_to_first(); | |
| 510 werd_it.add_list_after(&new_words); | |
| 511 } | |
| 512 } | |
| 513 } | |
| 514 | |
| 515 } // namespace tesseract |
