comparison mupdf-source/thirdparty/tesseract/src/wordrec/chopper.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer includes a version number.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
```cpp
/******************************************************************************
 *
 * File: chopper.cpp (Formerly chopper.c)
 * Author: Mark Seaman, OCR Technology
 *
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 *****************************************************************************/

// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#  include "config_auto.h"
#endif

#include "blamer.h"         // for BlamerBundle, IRR_CORRECT
#include "blobs.h"          // for TPOINT, TBLOB, EDGEPT, TESSLINE, divisible_blob
#include "dict.h"           // for Dict
#include "lm_pain_points.h" // for LMPainPoints
#include "lm_state.h"       // for BestChoiceBundle
#include "matrix.h"         // for MATRIX
#include "normalis.h"       // for DENORM
#include "pageres.h"        // for WERD_RES
#include "params.h"         // for IntParam, BoolParam
#include "ratngs.h"         // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST (ptr ...
#include "rect.h"           // for TBOX
#include "render.h"         // for display_blob
#include "seam.h"           // for SEAM
#include "split.h"          // for remove_edgept
#include "stopper.h"        // for DANGERR
#include "tprintf.h"        // for tprintf
#include "wordrec.h"        // for Wordrec, SegSearchPending (ptr only)

namespace tesseract {

// Even though the limit on the number of chunks may now be removed, keep
// the same limit for repeatable behavior, and it may be a speed advantage.
static const int kMaxNumChunks = 64;

/*----------------------------------------------------------------------
          F u n c t i o n s
----------------------------------------------------------------------*/

/**
 * @name check_blob
 *
 * @return true if blob has a non whole outline.
 */
static int check_blob(TBLOB *blob) {
  TESSLINE *outline;
  EDGEPT *edgept;

  for (outline = blob->outlines; outline != nullptr; outline = outline->next) {
    edgept = outline->loop;
    do {
      if (edgept == nullptr) {
        break;
      }
      edgept = edgept->next;
    } while (edgept != outline->loop);
    if (edgept == nullptr) {
      return 1;
    }
  }
  return 0;
}
```
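check_blob reports a blob as broken when one of its outlines' circular edge-point lists contains a null link before the walk wraps back to its starting point. A minimal standalone sketch of that ring walk, using a hypothetical Node type in place of EDGEPT:

```cpp
#include <iostream>

// Hypothetical stand-in for EDGEPT: a singly linked node in a (supposedly)
// circular list.
struct Node {
  Node *next = nullptr;
};

// Returns true if the ring starting at `start` is broken, i.e. a null link
// is reached before the walk returns to `start` (mirrors check_blob's test).
bool ring_is_broken(Node *start) {
  Node *p = start;
  do {
    if (p == nullptr) {
      return true;
    }
    p = p->next;
  } while (p != start);
  return false;
}

int main() {
  Node a, b, c;
  a.next = &b; b.next = &c; c.next = &a;   // closed ring
  std::cout << ring_is_broken(&a) << "\n"; // 0
  c.next = nullptr;                        // break the ring
  std::cout << ring_is_broken(&a) << "\n"; // 1
}
```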
```cpp
/**
 * @name any_shared_split_points
 *
 * Return true if any of the splits share a point with this one.
 */
static int any_shared_split_points(const std::vector<SEAM *> &seams, SEAM *seam) {
  int length;
  int index;

  length = seams.size();
  for (index = 0; index < length; index++) {
    if (seam->SharesPosition(*seams[index])) {
      return true;
    }
  }
  return false;
}

/**
 * @name preserve_outline_tree
 *
 * Copy the list of outlines.
 */
static void preserve_outline(EDGEPT *start) {
  EDGEPT *srcpt;

  if (start == nullptr) {
    return;
  }
  srcpt = start;
  do {
    srcpt->runlength = 1;
    srcpt = srcpt->next;
  } while (srcpt != start);
  srcpt->runlength = 2;
}

static void preserve_outline_tree(TESSLINE *srcline) {
  TESSLINE *outline;

  for (outline = srcline; outline != nullptr; outline = outline->next) {
    preserve_outline(outline->loop);
  }
}

/**
 * @name restore_outline_tree
 *
 * Copy the list of outlines.
 */
static EDGEPT *restore_outline(EDGEPT *start) {
  EDGEPT *srcpt;
  EDGEPT *real_start;

  if (start == nullptr) {
    return nullptr;
  }
  srcpt = start;
  do {
    if (srcpt->runlength == 2) {
      break;
    }
    srcpt = srcpt->next;
  } while (srcpt != start);
  real_start = srcpt;
  do {
    srcpt = srcpt->next;
    if (srcpt->prev->runlength == 0) {
      remove_edgept(srcpt->prev);
    }
  } while (srcpt != real_start);
  return real_start;
}

static void restore_outline_tree(TESSLINE *srcline) {
  TESSLINE *outline;

  for (outline = srcline; outline != nullptr; outline = outline->next) {
    outline->loop = restore_outline(outline->loop);
    outline->start = outline->loop->pos;
  }
}
```
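preserve_outline tags every current edge point with runlength 1 and the loop start with 2; points that a later (failed) chop introduces keep the default 0, so restore_outline can relocate the original start and discard everything it does not recognize. A self-contained sketch of that mark-and-restore scheme on a simplified circular list (Pt, insert_after and unlink are illustrative stand-ins for EDGEPT and remove_edgept):

```cpp
#include <cassert>
#include <iostream>

// Simplified stand-in for EDGEPT: a node in a circular doubly linked list
// with the same "runlength" marker used by preserve_outline/restore_outline.
struct Pt {
  int runlength = 0; // 0 = added after preservation, 1 = original, 2 = original start
  Pt *next = nullptr;
  Pt *prev = nullptr;
};

// Splice b in after a (what a chop does when it introduces new points).
void insert_after(Pt *a, Pt *b) {
  b->next = a->next;
  b->prev = a;
  a->next->prev = b;
  a->next = b;
}

// Unlink p from the ring (simplified remove_edgept; no memory management here).
void unlink(Pt *p) {
  p->prev->next = p->next;
  p->next->prev = p->prev;
}

void preserve(Pt *start) { // mark the current points as "original"
  Pt *p = start;
  do {
    p->runlength = 1;
    p = p->next;
  } while (p != start);
  start->runlength = 2;    // remember where the loop started
}

Pt *restore(Pt *start) {   // drop everything added since preserve()
  Pt *p = start;
  do {                     // find the preserved start marker
    if (p->runlength == 2) {
      break;
    }
    p = p->next;
  } while (p != start);
  Pt *real_start = p;
  do {
    p = p->next;
    if (p->prev->runlength == 0) {
      unlink(p->prev);
    }
  } while (p != real_start);
  return real_start;
}

int main() {
  Pt a, b, c; // original three-point ring
  a.next = &b; b.next = &c; c.next = &a;
  a.prev = &c; b.prev = &a; c.prev = &b;
  preserve(&a);

  Pt extra;                    // a point a failed chop might have added
  insert_after(&b, &extra);

  Pt *start = restore(&extra); // restore from any point on the ring
  assert(start == &a);
  int n = 0;
  Pt *p = start;
  do { ++n; p = p->next; } while (p != start);
  std::cout << n << " points left\n"; // 3: the added point is gone
}
```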
```cpp
/**********************************************************************
 * total_containment
 *
 * Check to see if one of these outlines is totally contained within
 * the bounding box of the other.
 **********************************************************************/
static int16_t total_containment(TBLOB *blob1, TBLOB *blob2) {
  TBOX box1 = blob1->bounding_box();
  TBOX box2 = blob2->bounding_box();
  return box1.contains(box2) || box2.contains(box1);
}

// Helper runs all the checks on a seam to make sure it is valid.
// Returns the seam if OK, otherwise deletes the seam and returns nullptr.
static SEAM *CheckSeam(int debug_level, int32_t blob_number, TWERD *word, TBLOB *blob,
                       TBLOB *other_blob, const std::vector<SEAM *> &seams, SEAM *seam) {
  if (seam == nullptr || blob->outlines == nullptr || other_blob->outlines == nullptr ||
      total_containment(blob, other_blob) || check_blob(other_blob) ||
      !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||
      any_shared_split_points(seams, seam) ||
      !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {
    word->blobs.erase(word->blobs.begin() + blob_number + 1);
    if (seam) {
      seam->UndoSeam(blob, other_blob);
      delete seam;
      seam = nullptr;
#ifndef GRAPHICS_DISABLED
      if (debug_level) {
        if (debug_level > 2) {
          display_blob(blob, ScrollView::RED);
        }
        tprintf("\n** seam being removed ** \n");
      }
#endif
    } else {
      delete other_blob;
    }
    return nullptr;
  }
  return seam;
}
```
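CheckSeam gathers every reason a tentative chop can be rejected in one place: on any failure it removes the placeholder blob, undoes the seam (or deletes the unused copy) and hands back nullptr, so the caller either gets a fully valid seam or sees no change at all. A tiny sketch of that validate-or-roll-back shape, with hypothetical names and a made-up validity rule:

```cpp
#include <iostream>
#include <vector>

// Apply a tentative edit, run all the checks in one place, and either keep
// the edit or roll it back -- the control shape CheckSeam gives its caller.
bool apply_checked(std::vector<int> &seq, int value) {
  seq.push_back(value);                       // tentative edit
  bool ok = value >= 0 && value >= seq.front(); // every rejection reason lives here
  if (!ok) {
    seq.pop_back();                           // roll the edit back
    return false;                             // caller sees "nothing happened"
  }
  return true;
}

int main() {
  std::vector<int> seq{3};
  std::cout << apply_checked(seq, 5) << " size=" << seq.size() << "\n"; // 1 size=2
  std::cout << apply_checked(seq, 1) << " size=" << seq.size() << "\n"; // 0 size=2
}
```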
```cpp
/**
 * @name attempt_blob_chop
 *
 * Try to split the this blob after this one. Check to make sure that
 * it was successful.
 */
SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
                                 const std::vector<SEAM *> &seams) {
  if (repair_unchopped_blobs) {
    preserve_outline_tree(blob->outlines);
  }
  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
  // Insert it into the word.
  word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);

  SEAM *seam = nullptr;
  if (prioritize_division) {
    TPOINT location;
    if (divisible_blob(blob, italic_blob, &location)) {
      seam = new SEAM(0.0f, location);
    }
  }
  if (seam == nullptr) {
    seam = pick_good_seam(blob);
  }
  if (chop_debug) {
    if (seam != nullptr) {
      seam->Print("Good seam picked=");
    } else {
      tprintf("\n** no seam picked *** \n");
    }
  }
  if (seam) {
    seam->ApplySeam(italic_blob, blob, other_blob);
  }

  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);
  if (seam == nullptr) {
    if (repair_unchopped_blobs) {
      restore_outline_tree(blob->outlines);
    }
    if (allow_blob_division && !prioritize_division) {
      // If the blob can simply be divided into outlines, then do that.
      TPOINT location;
      if (divisible_blob(blob, italic_blob, &location)) {
        other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
        word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);
        seam = new SEAM(0.0f, location);
        seam->ApplySeam(italic_blob, blob, other_blob);
        seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);
      }
    }
  }
  if (seam != nullptr) {
    // Make sure this seam doesn't get chopped again.
    seam->Finalize();
  }
  return seam;
}

SEAM *Wordrec::chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
                                  const std::vector<SEAM *> &seams) {
  return attempt_blob_chop(word, word->blobs[blob_number], blob_number, italic_blob, seams);
}

SEAM *Wordrec::chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob,
                                     WERD_RES *word_res, unsigned *blob_number) {
  TWERD *word = word_res->chopped_word;
  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
    TBLOB *blob = word->blobs[*blob_number];
    TPOINT topleft, botright;
    topleft.x = blob->bounding_box().left();
    topleft.y = blob->bounding_box().top();
    botright.x = blob->bounding_box().right();
    botright.y = blob->bounding_box().bottom();

    TPOINT original_topleft, original_botright;
    word_res->denorm.DenormTransform(nullptr, topleft, &original_topleft);
    word_res->denorm.DenormTransform(nullptr, botright, &original_botright);

    TBOX original_box =
        TBOX(original_topleft.x, original_botright.y, original_botright.x, original_topleft.y);

    bool almost_equal_box = false;
    int num_overlap = 0;
    for (auto &&boxe : boxes) {
      if (original_box.overlap_fraction(boxe) > 0.125) {
        num_overlap++;
      }
      if (original_box.almost_equal(boxe, 3)) {
        almost_equal_box = true;
      }
    }

    TPOINT location;
    if (divisible_blob(blob, italic_blob, &location) || (!almost_equal_box && num_overlap > 1)) {
      SEAM *seam = attempt_blob_chop(word, blob, *blob_number, italic_blob, word_res->seam_array);
      if (seam != nullptr) {
        return seam;
      }
    }
  }

  *blob_number = UINT_MAX;
  return nullptr;
}
```
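chop_overlapping_blob maps each blob's bounding box back to image coordinates with DenormTransform and then chops the blob when it is divisible, or when it overlaps more than one target box by more than 1/8 while matching none of them almost exactly. Below is a standalone sketch of that decision with a hypothetical Box type; treating overlap_fraction as "intersection area divided by the blob box's own area" is an assumption about the TBOX API, not something this file states:

```cpp
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <vector>

// Hypothetical axis-aligned box; Tesseract's TBOX plays this role above.
struct Box {
  int left, bottom, right, top;
  int width() const { return std::max(0, right - left); }
  int height() const { return std::max(0, top - bottom); }
  int area() const { return width() * height(); }
};

// Fraction of `a` covered by `b` (assumed meaning of overlap_fraction).
double overlap_fraction(const Box &a, const Box &b) {
  int w = std::max(0, std::min(a.right, b.right) - std::max(a.left, b.left));
  int h = std::max(0, std::min(a.top, b.top) - std::max(a.bottom, b.bottom));
  return a.area() > 0 ? static_cast<double>(w * h) / a.area() : 0.0;
}

// Roughly the role of almost_equal(box, 3): every edge within a tolerance.
bool almost_equal(const Box &a, const Box &b, int tol) {
  return std::abs(a.left - b.left) <= tol && std::abs(a.right - b.right) <= tol &&
         std::abs(a.bottom - b.bottom) <= tol && std::abs(a.top - b.top) <= tol;
}

// True if blob_box straddles more than one target box (and matches none),
// i.e. the non-divisible condition under which a chop is attempted above.
bool should_chop(const Box &blob_box, const std::vector<Box> &target_boxes) {
  bool almost_equal_box = false;
  int num_overlap = 0;
  for (const Box &t : target_boxes) {
    if (overlap_fraction(blob_box, t) > 0.125) {
      ++num_overlap;
    }
    if (almost_equal(blob_box, t, 3)) {
      almost_equal_box = true;
    }
  }
  return !almost_equal_box && num_overlap > 1;
}

int main() {
  Box blob{0, 0, 20, 10}; // spans two target characters
  std::vector<Box> targets{{0, 0, 10, 10}, {10, 0, 20, 10}};
  std::cout << std::boolalpha << should_chop(blob, targets) << "\n"; // true
}
```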
```cpp
/**
 * @name improve_one_blob
 *
 * Finds the best place to chop, based on the worst blob, fixpt, or next to
 * a fragment, according to the input. Returns the SEAM corresponding to the
 * chop point, if any is found, and the index in the ratings_matrix of the
 * chopped blob. Note that blob_choices is just a copy of the pointers in the
 * leading diagonal of the ratings MATRIX.
 * Although the blob is chopped, the returned SEAM is yet to be inserted into
 * word->seam_array and the resulting blobs are unclassified, so this function
 * can be used by ApplyBox as well as during recognition.
 */
SEAM *Wordrec::improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
                                bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
                                unsigned *blob_number) {
  float rating_ceiling = FLT_MAX;
  SEAM *seam = nullptr;
  do {
    auto blob = select_blob_to_split_from_fixpt(fixpt);
    if (chop_debug) {
      tprintf("blob_number from fixpt = %d\n", blob);
    }
    bool split_point_from_dict = (blob != -1);
    if (split_point_from_dict) {
      fixpt->clear();
    } else {
      blob = select_blob_to_split(blob_choices, rating_ceiling, split_next_to_fragment);
    }
    if (chop_debug) {
      tprintf("blob_number = %d\n", blob);
    }
    *blob_number = blob;
    if (blob == -1) {
      return nullptr;
    }

    // TODO(rays) it may eventually help to allow italic_blob to be true,
    seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob, word->seam_array);
    if (seam != nullptr) {
      break; // Success!
    }
    if (blob_choices[*blob_number] == nullptr) {
      return nullptr;
    }
    if (!split_point_from_dict) {
      // We chopped the worst rated blob, try something else next time.
      rating_ceiling = blob_choices[*blob_number]->rating();
    }
  } while (true);
  return seam;
}
```
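The retry loop above works with a falling ceiling: chop the worst-rated blob, and when the chop is rejected, lower rating_ceiling to that blob's rating so the next pass considers the next-worst candidate. A self-contained sketch of the same strategy over a plain vector of ratings (try_split is a stand-in for the chop attempt):

```cpp
#include <cfloat>
#include <cstdio>
#include <vector>

// Pick the index with the highest (worst) rating strictly below the ceiling,
// or -1 if nothing qualifies -- the role select_blob_to_split plays above.
int worst_below(const std::vector<float> &ratings, float ceiling) {
  int worst_index = -1;
  float worst = -FLT_MAX;
  for (size_t i = 0; i < ratings.size(); ++i) {
    if (ratings[i] < ceiling && ratings[i] > worst) {
      worst = ratings[i];
      worst_index = static_cast<int>(i);
    }
  }
  return worst_index;
}

int main() {
  std::vector<float> ratings = {1.2f, 7.5f, 3.1f, 5.0f};
  // Pretend only index 3 can actually be split (stand-in for attempt_blob_chop).
  auto try_split = [](int index) { return index == 3; };

  float rating_ceiling = FLT_MAX;
  while (true) {
    int blob = worst_below(ratings, rating_ceiling);
    if (blob == -1) {
      std::printf("no blob left to split\n");
      break;
    }
    if (try_split(blob)) {
      std::printf("split blob %d\n", blob); // prints "split blob 3"
      break;
    }
    rating_ceiling = ratings[blob]; // failed: exclude this blob next time.
  }
}
```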
```cpp
/**
 * @name chop_one_blob
 *
 * Start with the current one-blob word and its classification. Find
 * the worst blobs and try to divide it up to improve the ratings.
 * Used for testing chopper.
 */
SEAM *Wordrec::chop_one_blob(const std::vector<TBOX> &boxes,
                             const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
                             unsigned *blob_number) {
  if (prioritize_division) {
    return chop_overlapping_blob(boxes, true, word_res, blob_number);
  } else {
    return improve_one_blob(blob_choices, nullptr, false, true, word_res, blob_number);
  }
}

/**
 * @name chop_word_main
 *
 * Classify the blobs in this word and permute the results. Find the
 * worst blob in the word and chop it up. Continue this process until
 * a good answer has been found or all the blobs have been chopped up
 * enough. The results are returned in the WERD_RES.
 */
void Wordrec::chop_word_main(WERD_RES *word) {
  int num_blobs = word->chopped_word->NumBlobs();
  if (word->ratings == nullptr) {
    word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
  }
  if (word->ratings->get(0, 0) == nullptr) {
    // Run initial classification.
    for (int b = 0; b < num_blobs; ++b) {
      BLOB_CHOICE_LIST *choices = classify_piece(
          word->seam_array, b, b, "Initial:", word->chopped_word, word->blamer_bundle);
      word->ratings->put(b, b, choices);
    }
  } else {
    // Blobs have been pre-classified. Set matrix cell for all blob choices
    for (int col = 0; col < word->ratings->dimension(); ++col) {
      for (int row = col;
           row < word->ratings->dimension() && row < col + word->ratings->bandwidth(); ++row) {
        BLOB_CHOICE_LIST *choices = word->ratings->get(col, row);
        if (choices != nullptr) {
          BLOB_CHOICE_IT bc_it(choices);
          for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
            bc_it.data()->set_matrix_cell(col, row);
          }
        }
      }
    }
  }

  // Run Segmentation Search.
  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
  SegSearch(word, &best_choice_bundle, word->blamer_bundle);

  if (word->best_choice == nullptr) {
    // SegSearch found no valid paths, so just use the leading diagonal.
    word->FakeWordFromRatings(TOP_CHOICE_PERM);
  }
  word->RebuildBestState();
  // If we finished without a hyphen at the end of the word, let the next word
  // be found in the dictionary.
  if (word->word->flag(W_EOL) && !getDict().has_hyphen_end(*word->best_choice)) {
    getDict().reset_hyphen_vars(true);
  }

  if (word->blamer_bundle != nullptr && this->fill_lattice_ != nullptr) {
    CallFillLattice(*word->ratings, word->best_choices, *word->uch_set, word->blamer_bundle);
  }
  if (wordrec_debug_level > 0) {
    tprintf("Final Ratings Matrix:\n");
    word->ratings->print(getDict().getUnicharset());
  }
  word->FilterWordChoices(getDict().stopper_debug_level);
}
```
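chop_word_main only fills the leading diagonal of the ratings MATRIX (one classification per blob) before handing the word to SegSearch; the matrix is band-limited by wordrec_max_join_chunks, so only spans of up to that many consecutive chunks can ever be stored, as the pre-classified loop above suggests. A small illustrative band-limited upper-triangular table with the same fill and visiting pattern (BandMatrix and its string cells are purely hypothetical):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Band-limited upper-triangular table: cell (col, row) with col <= row and
// row - col < bandwidth describes the span of chunks [col, row].
class BandMatrix {
public:
  BandMatrix(int dimension, int bandwidth)
      : dim_(dimension), band_(bandwidth), cells_(dimension * bandwidth) {}
  std::string &at(int col, int row) { return cells_[col * band_ + (row - col)]; }
  int dimension() const { return dim_; }
  int bandwidth() const { return band_; }

private:
  int dim_, band_;
  std::vector<std::string> cells_;
};

int main() {
  BandMatrix ratings(4, 3); // 4 chunks, spans of at most 3 chunks

  // "Initial classification": fill the leading diagonal only.
  for (int b = 0; b < ratings.dimension(); ++b) {
    ratings.at(b, b) = "blob " + std::to_string(b);
  }

  // Visit every representable cell, mirroring the iteration pattern of the
  // pre-classified branch in chop_word_main.
  for (int col = 0; col < ratings.dimension(); ++col) {
    for (int row = col;
         row < ratings.dimension() && row < col + ratings.bandwidth(); ++row) {
      std::cout << "(" << col << "," << row << ") "
                << (ratings.at(col, row).empty() ? "<empty>" : ratings.at(col, row)) << "\n";
    }
  }
}
```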
```cpp
/**
 * @name improve_by_chopping
 *
 * Repeatedly chops the worst blob, classifying the new blobs fixing up all
 * the data, and incrementally runs the segmentation search until a good word
 * is found, or no more chops can be found.
 */
void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
                                  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
                                  LMPainPoints *pain_points,
                                  std::vector<SegSearchPending> *pending) {
  unsigned blob_number;
  do { // improvement loop.
    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
    // one to chop.
    std::vector<BLOB_CHOICE *> blob_choices;
    int num_blobs = word->ratings->dimension();
    for (int i = 0; i < num_blobs; ++i) {
      BLOB_CHOICE_LIST *choices = word->ratings->get(i, i);
      if (choices == nullptr || choices->empty()) {
        blob_choices.push_back(nullptr);
      } else {
        BLOB_CHOICE_IT bc_it(choices);
        blob_choices.push_back(bc_it.data());
      }
    }
    SEAM *seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt, false, false, word,
                                  &blob_number);
    if (seam == nullptr) {
      break;
    }
    // A chop has been made. We have to correct all the data structures to
    // take into account the extra bottom-level blob.
    // Put the seam into the seam_array and correct everything else on the
    // word: ratings matrix (including matrix location in the BLOB_CHOICES),
    // states in WERD_CHOICEs, and blob widths.
    word->InsertSeam(blob_number, seam);
    // Insert a new entry in the beam array.
    best_choice_bundle->beam.insert(best_choice_bundle->beam.begin() + blob_number, new LanguageModelState);
    // Fixpts are outdated, but will get recalculated.
    best_choice_bundle->fixpt.clear();
    // Remap existing pain points.
    pain_points->RemapForSplit(blob_number);
    // Insert a new pending at the chop point.
    pending->insert(pending->begin() + blob_number, SegSearchPending());

    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
    // as that updates the pending correctly and adds new pain points.
    MATRIX_COORD pain_point(blob_number, blob_number);
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word, pain_points, blamer_bundle);
    pain_point.col = blob_number + 1;
    pain_point.row = blob_number + 1;
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word, pain_points, blamer_bundle);
    if (language_model_->language_model_ngram_on) {
      // N-gram evaluation depends on the number of blobs in a chunk, so we
      // have to re-evaluate everything in the word.
      ResetNGramSearch(word, best_choice_bundle, *pending);
      blob_number = 0;
    }
    // Run language model incrementally. (Except with the n-gram model on.)
    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending, word, pain_points,
                         best_choice_bundle, blamer_bundle);
  } while (!language_model_->AcceptableChoiceFound() && word->ratings->dimension() < kMaxNumChunks);

  // If after running only the chopper best_choice is incorrect and no blame
  // has been yet set, blame the classifier if best_choice is classifier's
  // top choice and is a dictionary word (i.e. language model could not have
  // helped). Otherwise blame the tradeoff between the classifier and
  // the old language model (permuters).
  if (word->blamer_bundle != nullptr &&
      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
    bool valid_permuter = word->best_choice != nullptr &&
                          Dict::valid_word_permuter(word->best_choice->permuter(), false);
    word->blamer_bundle->BlameClassifierOrLangModel(word, getDict().getUnicharset(), valid_permuter,
                                                    wordrec_debug_blamer);
  }
}
```
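Stripped of the bookkeeping, improve_by_chopping is a loop: chop the worst blob, patch the per-blob data structures for the extra column, classify the two halves, and rerun the incremental search until the language model accepts a choice or the word reaches kMaxNumChunks pieces. A schematic, runnable skeleton of that control flow, with the Tesseract calls replaced by hypothetical stand-ins:

```cpp
#include <cstdio>
#include <vector>

namespace {
constexpr int kMaxNumChunks = 64;

struct WordState {
  std::vector<float> ratings{4.0f, 9.0f, 2.5f}; // one rating per blob (illustrative)
  int dimension() const { return static_cast<int>(ratings.size()); }
};

// Stand-in for improve_one_blob: split the worst-rated blob into two halves.
int chop_worst_blob(WordState *word) {
  int worst = 0;
  for (int i = 1; i < word->dimension(); ++i) {
    if (word->ratings[i] > word->ratings[worst]) {
      worst = i;
    }
  }
  if (word->ratings[worst] < 3.0f) {
    return -1; // nothing bad enough left to chop
  }
  float half = word->ratings[worst] / 2.0f; // pretend both halves rate better
  word->ratings[worst] = half;
  word->ratings.insert(word->ratings.begin() + worst + 1, half);
  return worst;
}

// Stand-in for the language model's acceptance test.
bool acceptable_choice_found(const WordState &word) {
  for (float r : word.ratings) {
    if (r >= 3.0f) {
      return false;
    }
  }
  return true;
}
} // namespace

int main() {
  WordState word;
  do {
    int blob = chop_worst_blob(&word);
    if (blob == -1) {
      break; // no further chop possible
    }
    std::printf("chopped blob %d, word now has %d pieces\n", blob, word.dimension());
    // ...here the real code inserts the seam, remaps pain points and pending
    // columns, classifies the two new blobs, and updates the search nodes...
  } while (!acceptable_choice_found(word) && word.dimension() < kMaxNumChunks);
  std::printf("final pieces: %d\n", word.dimension());
}
```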
```cpp
/**********************************************************************
 * select_blob_to_split
 *
 * These are the results of the last classification. Find a likely
 * place to apply splits. If none, return -1.
 **********************************************************************/
int Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices,
                                  float rating_ceiling, bool split_next_to_fragment) {
  BLOB_CHOICE *blob_choice;
  float worst = -FLT_MAX;
  int worst_index = -1;
  float worst_near_fragment = -FLT_MAX;
  int worst_index_near_fragment = -1;
  std::vector<const CHAR_FRAGMENT *> fragments;

  if (chop_debug) {
    if (rating_ceiling < FLT_MAX) {
      tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
    } else {
      tprintf("rating_ceiling = No Limit\n");
    }
  }

  if (split_next_to_fragment && blob_choices.size() > 0) {
    fragments.resize(blob_choices.size());
    if (blob_choices[0] != nullptr) {
      fragments[0] = getDict().getUnicharset().get_fragment(blob_choices[0]->unichar_id());
    } else {
      fragments[0] = nullptr;
    }
  }

  for (unsigned x = 0; x < blob_choices.size(); ++x) {
    if (blob_choices[x] == nullptr) {
      return x;
    } else {
      blob_choice = blob_choices[x];
      // Populate fragments for the following position.
      if (split_next_to_fragment && x + 1 < blob_choices.size()) {
        if (blob_choices[x + 1] != nullptr) {
          fragments[x + 1] =
              getDict().getUnicharset().get_fragment(blob_choices[x + 1]->unichar_id());
        } else {
          fragments[x + 1] = nullptr;
        }
      }
      if (blob_choice->rating() < rating_ceiling &&
          blob_choice->certainty() < tessedit_certainty_threshold) {
        // Update worst and worst_index.
        if (blob_choice->rating() > worst) {
          worst_index = x;
          worst = blob_choice->rating();
        }
        if (split_next_to_fragment) {
          // Update worst_near_fragment and worst_index_near_fragment.
          bool expand_following_fragment =
              (x + 1 < blob_choices.size() && fragments[x + 1] != nullptr &&
               !fragments[x + 1]->is_beginning());
          bool expand_preceding_fragment =
              (x > 0 && fragments[x - 1] != nullptr && !fragments[x - 1]->is_ending());
          if ((expand_following_fragment || expand_preceding_fragment) &&
              blob_choice->rating() > worst_near_fragment) {
            worst_index_near_fragment = x;
            worst_near_fragment = blob_choice->rating();
            if (chop_debug) {
              tprintf(
                  "worst_index_near_fragment=%d"
                  " expand_following_fragment=%d"
                  " expand_preceding_fragment=%d\n",
                  worst_index_near_fragment, expand_following_fragment, expand_preceding_fragment);
            }
          }
        }
      }
    }
  }
  // TODO(daria): maybe a threshold of badness for
  // worst_near_fragment would be useful.
  return worst_index_near_fragment != -1 ? worst_index_near_fragment : worst_index;
}

/**********************************************************************
 * select_blob_to_split_from_fixpt
 *
 * Given the fix point from a dictionary search, if there is a single
 * dangerous blob that maps to multiple characters, return that blob
 * index as a place we need to split. If none, return -1.
 **********************************************************************/
int Wordrec::select_blob_to_split_from_fixpt(DANGERR *fixpt) {
  if (!fixpt) {
    return -1;
  }
  for (auto &i : *fixpt) {
    if (i.begin + 1 == i.end && i.dangerous && i.correct_is_ngram) {
      return i.begin;
    }
  }
  return -1;
}

} // namespace tesseract
```
