Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/werd.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: werd.cpp (Formerly word.c) | |
| 3 * Description: Code for the WERD class. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 1991, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 // Include automatically generated configuration file if running autoconf. | |
| 20 #ifdef HAVE_CONFIG_H | |
| 21 # include "config_auto.h" | |
| 22 #endif | |
| 23 | |
| 24 #include "werd.h" | |
| 25 | |
| 26 #include "linlsq.h" | |
| 27 | |
| 28 #include "helpers.h" | |
| 29 | |
| 30 namespace tesseract { | |
| 31 | |
| 32 #define FIRST_COLOUR ScrollView::RED ///< first rainbow colour; the rainbow WERD::plot starts here and NextColor wraps back to it | |
| 33 #define LAST_COLOUR ScrollView::AQUAMARINE ///< last rainbow colour; NextColor wraps to FIRST_COLOUR on reaching it | |
| 34 #define CHILD_COLOUR ScrollView::BROWN ///< colour of children (second colour passed to C_BLOB::plot by the rainbow WERD::plot) | |
| 35 | |
| 36 /** | |
| 37 * WERD::WERD | |
| 38 * | |
| 39 * Constructor to build a WERD from a list of C_BLOBs. | |
| 40 * blob_list The C_BLOBs (in word order) are not copied; | |
| 41 * we take its elements and put them in our lists. | |
| 42 * blank_count blanks in front of the word | |
| 43 * text correct text, outlives this WERD | |
| 44 */ | |
| 45 WERD::WERD(C_BLOB_LIST *blob_list, uint8_t blank_count, const char *text) | |
| 46 : blanks(blank_count), flags(0), script_id_(0), correct(text ? text : "") { | |
| 47 C_BLOB_IT start_it = &cblobs; | |
| 48 C_BLOB_IT rej_cblob_it = &rej_cblobs; | |
| 49 C_OUTLINE_IT c_outline_it; | |
| 50 int16_t inverted_vote = 0; | |
| 51 int16_t non_inverted_vote = 0; | |
| 52 | |
| 53 // Move blob_list's elements into cblobs. | |
| 54 start_it.add_list_after(blob_list); | |
| 55 | |
| 56 /* | |
| 57 Set white on black flag for the WERD, moving any duff blobs onto the | |
| 58 rej_cblobs list. | |
| 59 First, walk the cblobs checking the inverse flag for each outline of each | |
| 60 cblob. If a cblob has inconsistent flag settings for its different | |
| 61 outlines, move the blob to the reject list. Otherwise, increment the | |
| 62 appropriate w-on-b or b-on-w vote for the word. | |
| 63 | |
| 64 Now set the inversion flag for the WERD by maximum vote. | |
| 65 | |
| 66 Walk the blobs again, moving any blob whose inversion flag does not agree | |
| 67 with the concencus onto the reject list. | |
| 68 */ | |
| 69 start_it.set_to_list(&cblobs); | |
| 70 if (start_it.empty()) { | |
| 71 return; | |
| 72 } | |
| 73 for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { | |
| 74 bool reject_blob = false; | |
| 75 bool blob_inverted; | |
| 76 | |
| 77 c_outline_it.set_to_list(start_it.data()->out_list()); | |
| 78 blob_inverted = c_outline_it.data()->flag(COUT_INVERSE); | |
| 79 for (c_outline_it.mark_cycle_pt(); !c_outline_it.cycled_list() && !reject_blob; | |
| 80 c_outline_it.forward()) { | |
| 81 reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted; | |
| 82 } | |
| 83 if (reject_blob) { | |
| 84 rej_cblob_it.add_after_then_move(start_it.extract()); | |
| 85 } else { | |
| 86 if (blob_inverted) { | |
| 87 inverted_vote++; | |
| 88 } else { | |
| 89 non_inverted_vote++; | |
| 90 } | |
| 91 } | |
| 92 } | |
| 93 | |
| 94 flags.set(W_INVERSE, (inverted_vote > non_inverted_vote)); | |
| 95 | |
| 96 start_it.set_to_list(&cblobs); | |
| 97 if (start_it.empty()) { | |
| 98 return; | |
| 99 } | |
| 100 for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { | |
| 101 c_outline_it.set_to_list(start_it.data()->out_list()); | |
| 102 if (c_outline_it.data()->flag(COUT_INVERSE) != flags[W_INVERSE]) { | |
| 103 rej_cblob_it.add_after_then_move(start_it.extract()); | |
| 104 } | |
| 105 } | |
| 106 } | |
| 107 | |
| 108 /** | |
| 109 * WERD::WERD | |
| 110 * | |
| 111 * Constructor to build a WERD from a list of C_BLOBs. | |
| 112 * The C_BLOBs are not copied so the source list is emptied. | |
| 113 */ | |
| 114 | |
| 115 WERD::WERD(C_BLOB_LIST *blob_list, ///< In word order | |
| 116 WERD *clone) ///< Source of flags | |
| 117 : flags(clone->flags), script_id_(clone->script_id_), correct(clone->correct) { | |
| 118 C_BLOB_IT start_it = blob_list; // iterator | |
| 119 C_BLOB_IT end_it = blob_list; // another | |
| 120 | |
| 121 while (!end_it.at_last()) { | |
| 122 end_it.forward(); // move to last | |
| 123 } | |
| 124 cblobs.assign_to_sublist(&start_it, &end_it); | |
| 125 // move to our list | |
| 126 blanks = clone->blanks; | |
| 127 // fprintf(stderr,"Wrong constructor!!!!\n"); | |
| 128 } | |
| 129 | |
| 130 // Construct a WERD from a single_blob and clone the flags from this. | |
| 131 // W_BOL and W_EOL flags are set according to the given values. | |
| 132 WERD *WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob) { | |
| 133 C_BLOB_LIST temp_blobs; | |
| 134 C_BLOB_IT temp_it(&temp_blobs); | |
| 135 temp_it.add_after_then_move(blob); | |
| 136 WERD *blob_word = new WERD(&temp_blobs, this); | |
| 137 blob_word->set_flag(W_BOL, bol); | |
| 138 blob_word->set_flag(W_EOL, eol); | |
| 139 return blob_word; | |
| 140 } | |
| 141 | |
| 142 /** | |
| 143 * WERD::bounding_box | |
| 144 * | |
| 145 * Return the bounding box of the WERD. | |
| 146 * This is quite a mess to compute! | |
| 147 * ORIGINALLY, REJECT CBLOBS WERE EXCLUDED, however, this led to bugs when the | |
| 148 * words on the row were re-sorted. The original words were built with reject | |
| 149 * blobs included. The FUZZY SPACE flags were set accordingly. If ALL the | |
| 150 * blobs in a word are rejected the BB for the word is nullptr, causing the sort | |
| 151 * to screw up, leading to the erroneous possibility of the first word in a | |
| 152 * row being marked as FUZZY space. | |
| 153 */ | |
| 154 | |
| 155 TBOX WERD::bounding_box() const { | |
| 156 return restricted_bounding_box(true, true); | |
| 157 } | |
| 158 | |
| 159 // Returns the bounding box including the desired combination of upper and | |
| 160 // lower noise/diacritic elements. | |
| 161 TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const { | |
| 162 TBOX box = true_bounding_box(); | |
| 163 int bottom = box.bottom(); | |
| 164 int top = box.top(); | |
| 165 // This is a read-only iteration of the rejected blobs. | |
| 166 C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&rej_cblobs)); | |
| 167 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 168 TBOX dot_box = it.data()->bounding_box(); | |
| 169 if ((upper_dots || dot_box.bottom() <= top) && (lower_dots || dot_box.top() >= bottom)) { | |
| 170 box += dot_box; | |
| 171 } | |
| 172 } | |
| 173 return box; | |
| 174 } | |
| 175 | |
| 176 // Returns the bounding box of only the good blobs. | |
| 177 TBOX WERD::true_bounding_box() const { | |
| 178 TBOX box; // box being built | |
| 179 // This is a read-only iteration of the good blobs. | |
| 180 C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&cblobs)); | |
| 181 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 182 box += it.data()->bounding_box(); | |
| 183 } | |
| 184 return box; | |
| 185 } | |
| 186 | |
| 187 /** | |
| 188 * WERD::move | |
| 189 * | |
| 190 * Reposition WERD by vector | |
| 191 * NOTE!! REJECT CBLOBS ARE NOT MOVED | |
| 192 */ | |
| 193 | |
| 194 void WERD::move(const ICOORD vec) { | |
| 195 C_BLOB_IT cblob_it(&cblobs); // cblob iterator | |
| 196 | |
| 197 for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) { | |
| 198 cblob_it.data()->move(vec); | |
| 199 } | |
| 200 } | |
| 201 | |
| 202 /** | |
| 203 * WERD::join_on | |
| 204 * | |
| 205 * Join other word onto this one. Delete the old word. | |
| 206 */ | |
| 207 | |
| 208 void WERD::join_on(WERD *other) { | |
| 209 C_BLOB_IT blob_it(&cblobs); | |
| 210 C_BLOB_IT src_it(&other->cblobs); | |
| 211 C_BLOB_IT rej_cblob_it(&rej_cblobs); | |
| 212 C_BLOB_IT src_rej_it(&other->rej_cblobs); | |
| 213 | |
| 214 while (!src_it.empty()) { | |
| 215 blob_it.add_to_end(src_it.extract()); | |
| 216 src_it.forward(); | |
| 217 } | |
| 218 while (!src_rej_it.empty()) { | |
| 219 rej_cblob_it.add_to_end(src_rej_it.extract()); | |
| 220 src_rej_it.forward(); | |
| 221 } | |
| 222 } | |
| 223 | |
| 224 /** | |
| 225 * WERD::copy_on | |
| 226 * | |
| 227 * Copy blobs from other word onto this one. | |
| 228 */ | |
| 229 | |
| 230 void WERD::copy_on(WERD *other) { | |
| 231 bool reversed = other->bounding_box().left() < bounding_box().left(); | |
| 232 C_BLOB_IT c_blob_it(&cblobs); | |
| 233 C_BLOB_LIST c_blobs; | |
| 234 | |
| 235 c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy); | |
| 236 if (reversed) { | |
| 237 c_blob_it.add_list_before(&c_blobs); | |
| 238 } else { | |
| 239 c_blob_it.move_to_last(); | |
| 240 c_blob_it.add_list_after(&c_blobs); | |
| 241 } | |
| 242 if (!other->rej_cblobs.empty()) { | |
| 243 C_BLOB_IT rej_c_blob_it(&rej_cblobs); | |
| 244 C_BLOB_LIST new_rej_c_blobs; | |
| 245 | |
| 246 new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy); | |
| 247 if (reversed) { | |
| 248 rej_c_blob_it.add_list_before(&new_rej_c_blobs); | |
| 249 } else { | |
| 250 rej_c_blob_it.move_to_last(); | |
| 251 rej_c_blob_it.add_list_after(&new_rej_c_blobs); | |
| 252 } | |
| 253 } | |
| 254 } | |
| 255 | |
| 256 /** | |
| 257 * WERD::print | |
| 258 * | |
| 259 * Display members | |
| 260 */ | |
| 261 | |
| 262 void WERD::print() const { | |
| 263 tprintf("Blanks= %d\n", blanks); | |
| 264 bounding_box().print(); | |
| 265 tprintf("Flags = %lu = 0%lo\n", flags.to_ulong(), flags.to_ulong()); | |
| 266 tprintf(" W_SEGMENTED = %s\n", flags[W_SEGMENTED] ? "TRUE" : "FALSE"); | |
| 267 tprintf(" W_ITALIC = %s\n", flags[W_ITALIC] ? "TRUE" : "FALSE"); | |
| 268 tprintf(" W_BOL = %s\n", flags[W_BOL] ? "TRUE" : "FALSE"); | |
| 269 tprintf(" W_EOL = %s\n", flags[W_EOL] ? "TRUE" : "FALSE"); | |
| 270 tprintf(" W_NORMALIZED = %s\n", flags[W_NORMALIZED] ? "TRUE" : "FALSE"); | |
| 271 tprintf(" W_SCRIPT_HAS_XHEIGHT = %s\n", flags[W_SCRIPT_HAS_XHEIGHT] ? "TRUE" : "FALSE"); | |
| 272 tprintf(" W_SCRIPT_IS_LATIN = %s\n", flags[W_SCRIPT_IS_LATIN] ? "TRUE" : "FALSE"); | |
| 273 tprintf(" W_DONT_CHOP = %s\n", flags[W_DONT_CHOP] ? "TRUE" : "FALSE"); | |
| 274 tprintf(" W_REP_CHAR = %s\n", flags[W_REP_CHAR] ? "TRUE" : "FALSE"); | |
| 275 tprintf(" W_FUZZY_SP = %s\n", flags[W_FUZZY_SP] ? "TRUE" : "FALSE"); | |
| 276 tprintf(" W_FUZZY_NON = %s\n", flags[W_FUZZY_NON] ? "TRUE" : "FALSE"); | |
| 277 tprintf("Correct= %s\n", correct.c_str()); | |
| 278 tprintf("Rejected cblob count = %d\n", rej_cblobs.length()); | |
| 279 tprintf("Script = %d\n", script_id_); | |
| 280 } | |
| 281 | |
| 282 /** | |
| 283 * WERD::plot | |
| 284 * | |
| 285 * Draw the WERD in the given colour. | |
| 286 */ | |
| 287 | |
| 288 #ifndef GRAPHICS_DISABLED | |
| 289 void WERD::plot(ScrollView *window, ScrollView::Color colour) { | |
| 290 C_BLOB_IT it = &cblobs; | |
| 291 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 292 it.data()->plot(window, colour, colour); | |
| 293 } | |
| 294 plot_rej_blobs(window); | |
| 295 } | |
| 296 | |
| 297 // Get the next color in the (looping) rainbow. | |
| 298 ScrollView::Color WERD::NextColor(ScrollView::Color colour) { | |
| 299 auto next = static_cast<ScrollView::Color>(colour + 1); | |
| 300 if (next >= LAST_COLOUR || next < FIRST_COLOUR) { | |
| 301 next = FIRST_COLOUR; | |
| 302 } | |
| 303 return next; | |
| 304 } | |
| 305 | |
| 306 /** | |
| 307 * WERD::plot | |
| 308 * | |
| 309 * Draw the WERD in rainbow colours in window. | |
| 310 */ | |
| 311 | |
| 312 void WERD::plot(ScrollView *window) { | |
| 313 ScrollView::Color colour = FIRST_COLOUR; | |
| 314 C_BLOB_IT it = &cblobs; | |
| 315 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 316 it.data()->plot(window, colour, CHILD_COLOUR); | |
| 317 colour = NextColor(colour); | |
| 318 } | |
| 319 plot_rej_blobs(window); | |
| 320 } | |
| 321 | |
| 322 /** | |
| 323 * WERD::plot_rej_blobs | |
| 324 * | |
| 325 * Draw the WERD rejected blobs in window - ALWAYS GREY | |
| 326 */ | |
| 327 | |
| 328 void WERD::plot_rej_blobs(ScrollView *window) { | |
| 329 C_BLOB_IT it = &rej_cblobs; | |
| 330 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 331 it.data()->plot(window, ScrollView::GREY, ScrollView::GREY); | |
| 332 } | |
| 333 } | |
| 334 #endif // !GRAPHICS_DISABLED | |
| 335 | |
| 336 /** | |
| 337 * WERD::shallow_copy() | |
| 338 * | |
| 339 * Make a shallow copy of a word | |
| 340 */ | |
| 341 | |
| 342 WERD *WERD::shallow_copy() { | |
| 343 WERD *new_word = new WERD; | |
| 344 | |
| 345 new_word->blanks = blanks; | |
| 346 new_word->flags = flags; | |
| 347 new_word->correct = correct; | |
| 348 return new_word; | |
| 349 } | |
| 350 | |
| 351 /** | |
| 352 * WERD::operator= | |
| 353 * | |
| 354 * Assign a word, DEEP copying the blob list | |
| 355 */ | |
| 356 | |
| 357 WERD &WERD::operator=(const WERD &source) { | |
| 358 this->ELIST2_LINK::operator=(source); | |
| 359 blanks = source.blanks; | |
| 360 flags = source.flags; | |
| 361 script_id_ = source.script_id_; | |
| 362 correct = source.correct; | |
| 363 cblobs.clear(); | |
| 364 cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy); | |
| 365 rej_cblobs.clear(); | |
| 366 rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy); | |
| 367 return *this; | |
| 368 } | |
| 369 | |
| 370 /** | |
| 371 * word_comparator() | |
| 372 * | |
| 373 * word comparator used to sort a word list so that words are in increasing | |
| 374 * order of left edge. | |
| 375 */ | |
| 376 | |
| 377 int word_comparator(const void *word1p, const void *word2p) { | |
| 378 const WERD *word1 = *reinterpret_cast<const WERD *const *>(word1p); | |
| 379 const WERD *word2 = *reinterpret_cast<const WERD *const *>(word2p); | |
| 380 return word1->bounding_box().left() - word2->bounding_box().left(); | |
| 381 } | |
| 382 | |
| 383 /** | |
| 384 * WERD::ConstructWerdWithNewBlobs() | |
| 385 * | |
| 386 * This method returns a new werd constructed using the blobs in the input | |
| 387 * all_blobs list, which correspond to the blobs in this werd object. The | |
| 388 * blobs used to construct the new word are consumed and removed from the | |
| 389 * input all_blobs list. | |
| 390 * Returns nullptr if the word couldn't be constructed. | |
| 391 * Returns original blobs for which no matches were found in the output list | |
| 392 * orphan_blobs (appends). | |
| 393 */ | |
| 394 | |
| 395 WERD *WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs) { | |
| 396 C_BLOB_LIST current_blob_list; | |
| 397 C_BLOB_IT werd_blobs_it(¤t_blob_list); | |
| 398 // Add the word's c_blobs. | |
| 399 werd_blobs_it.add_list_after(cblob_list()); | |
| 400 | |
| 401 // New blob list. These contain the blobs which will form the new word. | |
| 402 C_BLOB_LIST new_werd_blobs; | |
| 403 C_BLOB_IT new_blobs_it(&new_werd_blobs); | |
| 404 | |
| 405 // not_found_blobs contains the list of current word's blobs for which a | |
| 406 // corresponding blob wasn't found in the input all_blobs list. | |
| 407 C_BLOB_LIST not_found_blobs; | |
| 408 C_BLOB_IT not_found_it(¬_found_blobs); | |
| 409 not_found_it.move_to_last(); | |
| 410 | |
| 411 werd_blobs_it.move_to_first(); | |
| 412 for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list(); werd_blobs_it.forward()) { | |
| 413 C_BLOB *werd_blob = werd_blobs_it.extract(); | |
| 414 TBOX werd_blob_box = werd_blob->bounding_box(); | |
| 415 bool found = false; | |
| 416 // Now find the corresponding blob for this blob in the all_blobs | |
| 417 // list. For now, follow the inefficient method of pairwise | |
| 418 // comparisons. Ideally, one can pre-bucket the blobs by row. | |
| 419 C_BLOB_IT all_blobs_it(all_blobs); | |
| 420 for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) { | |
| 421 C_BLOB *a_blob = all_blobs_it.data(); | |
| 422 // Compute the overlap of the two blobs. If major, a_blob should | |
| 423 // be added to the new blobs list. | |
| 424 TBOX a_blob_box = a_blob->bounding_box(); | |
| 425 if (a_blob_box.null_box()) { | |
| 426 tprintf("Bounding box couldn't be ascertained\n"); | |
| 427 } | |
| 428 if (werd_blob_box.contains(a_blob_box) || werd_blob_box.major_overlap(a_blob_box)) { | |
| 429 // Old blobs are from minimal splits, therefore are expected to be | |
| 430 // bigger. The new small blobs should cover a significant portion. | |
| 431 // This is it. | |
| 432 all_blobs_it.extract(); | |
| 433 new_blobs_it.add_after_then_move(a_blob); | |
| 434 found = true; | |
| 435 } | |
| 436 } | |
| 437 if (!found) { | |
| 438 not_found_it.add_after_then_move(werd_blob); | |
| 439 } else { | |
| 440 delete werd_blob; | |
| 441 } | |
| 442 } | |
| 443 // Iterate over all not found blobs. Some of them may be due to | |
| 444 // under-segmentation (which is OK, since the corresponding blob is already | |
| 445 // in the list in that case. | |
| 446 not_found_it.move_to_first(); | |
| 447 for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) { | |
| 448 C_BLOB *not_found = not_found_it.data(); | |
| 449 TBOX not_found_box = not_found->bounding_box(); | |
| 450 C_BLOB_IT existing_blobs_it(new_blobs_it); | |
| 451 for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list(); | |
| 452 existing_blobs_it.forward()) { | |
| 453 C_BLOB *a_blob = existing_blobs_it.data(); | |
| 454 TBOX a_blob_box = a_blob->bounding_box(); | |
| 455 if ((not_found_box.major_overlap(a_blob_box) || a_blob_box.major_overlap(not_found_box)) && | |
| 456 not_found_box.y_overlap_fraction(a_blob_box) > 0.8) { | |
| 457 // Already taken care of. | |
| 458 delete not_found_it.extract(); | |
| 459 break; | |
| 460 } | |
| 461 } | |
| 462 } | |
| 463 if (orphan_blobs) { | |
| 464 C_BLOB_IT orphan_blobs_it(orphan_blobs); | |
| 465 orphan_blobs_it.move_to_last(); | |
| 466 orphan_blobs_it.add_list_after(¬_found_blobs); | |
| 467 } | |
| 468 | |
| 469 // New blobs are ready. Create a new werd object with these. | |
| 470 WERD *new_werd = nullptr; | |
| 471 if (!new_werd_blobs.empty()) { | |
| 472 new_werd = new WERD(&new_werd_blobs, this); | |
| 473 } else { | |
| 474 // Add the blobs back to this word so that it can be reused. | |
| 475 C_BLOB_IT this_list_it(cblob_list()); | |
| 476 this_list_it.add_list_after(¬_found_blobs); | |
| 477 } | |
| 478 return new_werd; | |
| 479 } | |
| 480 | |
| 481 // Removes noise from the word by moving small outlines to the rej_cblobs | |
| 482 // list, based on the size_threshold. | |
| 483 void WERD::CleanNoise(float size_threshold) { | |
| 484 C_BLOB_IT blob_it(&cblobs); | |
| 485 C_BLOB_IT rej_it(&rej_cblobs); | |
| 486 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 487 C_BLOB *blob = blob_it.data(); | |
| 488 C_OUTLINE_IT ol_it(blob->out_list()); | |
| 489 for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { | |
| 490 C_OUTLINE *outline = ol_it.data(); | |
| 491 TBOX ol_box = outline->bounding_box(); | |
| 492 int ol_size = ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height(); | |
| 493 if (ol_size < size_threshold) { | |
| 494 // This outline is too small. Move it to a separate blob in the | |
| 495 // reject blobs list. | |
| 496 auto *rej_blob = new C_BLOB(ol_it.extract()); | |
| 497 rej_it.add_after_then_move(rej_blob); | |
| 498 } | |
| 499 } | |
| 500 if (blob->out_list()->empty()) { | |
| 501 delete blob_it.extract(); | |
| 502 } | |
| 503 } | |
| 504 } | |
| 505 | |
| 506 // Extracts all the noise outlines and stuffs the pointers into the given | |
| 507 // vector of outlines. Afterwards, the outlines vector owns the pointers. | |
| 508 void WERD::GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines) { | |
| 509 C_BLOB_IT rej_it(&rej_cblobs); | |
| 510 for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) { | |
| 511 C_BLOB *blob = rej_it.extract(); | |
| 512 C_OUTLINE_IT ol_it(blob->out_list()); | |
| 513 outlines->push_back(ol_it.extract()); | |
| 514 delete blob; | |
| 515 } | |
| 516 } | |
| 517 | |
| 518 // Adds the selected outlines to the indcated real blobs, and puts the rest | |
| 519 // back in rej_cblobs where they came from. Where the target_blobs entry is | |
| 520 // nullptr, a run of wanted outlines is put into a single new blob. | |
| 521 // Ownership of the outlines is transferred back to the word. (Hence | |
| 522 // vector and not PointerVector.) | |
| 523 // Returns true if any new blob was added to the start of the word, which | |
| 524 // suggests that it might need joining to the word before it, and likewise | |
| 525 // sets make_next_word_fuzzy true if any new blob was added to the end. | |
| 526 bool WERD::AddSelectedOutlines(const std::vector<bool> &wanted, | |
| 527 const std::vector<C_BLOB *> &target_blobs, | |
| 528 const std::vector<C_OUTLINE *> &outlines, | |
| 529 bool *make_next_word_fuzzy) { | |
| 530 bool outline_added_to_start = false; | |
| 531 if (make_next_word_fuzzy != nullptr) { | |
| 532 *make_next_word_fuzzy = false; | |
| 533 } | |
| 534 C_BLOB_IT rej_it(&rej_cblobs); | |
| 535 for (unsigned i = 0; i < outlines.size(); ++i) { | |
| 536 C_OUTLINE *outline = outlines[i]; | |
| 537 if (outline == nullptr) { | |
| 538 continue; // Already used it. | |
| 539 } | |
| 540 if (wanted[i]) { | |
| 541 C_BLOB *target_blob = target_blobs[i]; | |
| 542 TBOX noise_box = outline->bounding_box(); | |
| 543 if (target_blob == nullptr) { | |
| 544 target_blob = new C_BLOB(outline); | |
| 545 // Need to find the insertion point. | |
| 546 C_BLOB_IT blob_it(&cblobs); | |
| 547 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 548 C_BLOB *blob = blob_it.data(); | |
| 549 TBOX blob_box = blob->bounding_box(); | |
| 550 if (blob_box.left() > noise_box.left()) { | |
| 551 if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) { | |
| 552 // We might want to join this word to its predecessor. | |
| 553 outline_added_to_start = true; | |
| 554 } | |
| 555 blob_it.add_before_stay_put(target_blob); | |
| 556 break; | |
| 557 } | |
| 558 } | |
| 559 if (blob_it.cycled_list()) { | |
| 560 blob_it.add_to_end(target_blob); | |
| 561 if (make_next_word_fuzzy != nullptr) { | |
| 562 *make_next_word_fuzzy = true; | |
| 563 } | |
| 564 } | |
| 565 // Add all consecutive wanted, but null-blob outlines to same blob. | |
| 566 C_OUTLINE_IT ol_it(target_blob->out_list()); | |
| 567 while (i + 1 < outlines.size() && wanted[i + 1] && target_blobs[i + 1] == nullptr) { | |
| 568 ++i; | |
| 569 ol_it.add_to_end(outlines[i]); | |
| 570 } | |
| 571 } else { | |
| 572 // Insert outline into this blob. | |
| 573 C_OUTLINE_IT ol_it(target_blob->out_list()); | |
| 574 ol_it.add_to_end(outline); | |
| 575 } | |
| 576 } else { | |
| 577 // Put back on noise list. | |
| 578 rej_it.add_to_end(new C_BLOB(outline)); | |
| 579 } | |
| 580 } | |
| 581 return outline_added_to_start; | |
| 582 } | |
| 583 | |
| 584 } // namespace tesseract |
