Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/pageres.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: pageres.cpp (Formerly page_res.c) | |
| 3 * Description: Hierarchy of results classes from PAGE_RES to WERD_RES | |
| 4 * and an iterator class to iterate over the words. | |
| 5 * Main purposes: | |
| 6 * Easy way to iterate over the words without a 3-nested loop. | |
| 7 * Holds data used during word recognition. | |
| 8 * Holds information about alternative spacing paths. | |
| 9 * Author: Phil Cheatle | |
| 10 * | |
| 11 * (C) Copyright 1992, Hewlett-Packard Ltd. | |
| 12 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 13 ** you may not use this file except in compliance with the License. | |
| 14 ** You may obtain a copy of the License at | |
| 15 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 16 ** Unless required by applicable law or agreed to in writing, software | |
| 17 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 18 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 19 ** See the License for the specific language governing permissions and | |
| 20 ** limitations under the License. | |
| 21 * | |
| 22 **********************************************************************/ | |
| 23 | |
| 24 #include "pageres.h" | |
| 25 | |
| 26 #include "blamer.h" // for BlamerBundle | |
| 27 #include "blobs.h" // for TWERD, TBLOB | |
| 28 #include "boxword.h" // for BoxWord | |
| 29 #include "errcode.h" // for ASSERT_HOST | |
| 30 #include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only) | |
| 31 #include "ocrrow.h" // for ROW, ROW_IT | |
| 32 #include "pdblock.h" // for PDBLK | |
| 33 #include "polyblk.h" // for POLY_BLOCK | |
| 34 #include "seam.h" // for SEAM, start_seam_list | |
| 35 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST | |
| 36 #include "tprintf.h" // for tprintf | |
| 37 | |
| 38 #include <tesseract/publictypes.h> // for OcrEngineMode, OEM_LSTM_ONLY | |
| 39 | |
| 40 #include <cassert> // for assert | |
| 41 #include <cstdint> // for INT32_MAX | |
| 42 #include <cstring> // for strlen | |
| 43 | |
| 44 struct Pix; | |
| 45 | |
| 46 namespace tesseract { | |
| 47 | |
| 48 // Gain factor applied by StopperAmbigThreshold to the difference between the | |
| 49 // adjust factors of two words when computing an ambiguity threshold. | |
| 50 static const double kStopperAmbiguityThresholdGain = 8.0; | |
| 51 // Constant offset subtracted by StopperAmbigThreshold from the scaled | |
| 52 // adjust-factor difference when computing an ambiguity threshold. | |
| 53 static const double kStopperAmbiguityThresholdOffset = 1.5; | |
| 54 // Max number of broken pieces to associate; passed as the second argument of the ratings MATRIX constructor in SetupForRecognition. | |
| 55 const int kWordrecMaxNumJoinChunks = 4; | |
| 56 // Max ratio of word box height to line height for a word to be merged into | |
| 57 // a combination with neighbouring words (see ROW_RES::ROW_RES). | |
| 58 const double kMaxWordSizeRatio = 1.25; | |
| 59 // Max ratio of the merged (union) box height to line height to allow a new word to be added. | |
| 60 const double kMaxLineSizeRatio = 1.25; | |
| 61 // Max ratio of the horizontal gap before the next word to line height to allow a new word to be added. | |
| 62 const double kMaxWordGapRatio = 2.0; | |
| 63 | |
| 64 // Computes and returns a threshold of certainty difference used to determine | |
| 65 // which words to keep, based on the adjustment factors of the two words. | |
| 66 // TODO(rays) This is horrible. Replace with an enhance params training model. | |
| 67 static double StopperAmbigThreshold(double f1, double f2) { | |
| 68 return (f2 - f1) * kStopperAmbiguityThresholdGain - | |
| 69 kStopperAmbiguityThresholdOffset; | |
| 70 } | |
| 71 | |
| 72 /************************************************************************* | |
| 73 * PAGE_RES::PAGE_RES | |
| 74 * | |
| 75 * Constructor for page results | |
| 76 *************************************************************************/ | |
| 77 PAGE_RES::PAGE_RES(bool merge_similar_words, BLOCK_LIST *the_block_list, | |
| 78 WERD_CHOICE **prev_word_best_choice_ptr) { | |
| 79 Init(); | |
| 80 BLOCK_IT block_it(the_block_list); | |
| 81 BLOCK_RES_IT block_res_it(&block_res_list); | |
| 82 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 83 block_res_it.add_to_end( | |
| 84 new BLOCK_RES(merge_similar_words, block_it.data())); | |
| 85 } | |
| 86 prev_word_best_choice = prev_word_best_choice_ptr; | |
| 87 } | |
| 88 | |
| 89 /************************************************************************* | |
| 90 * BLOCK_RES::BLOCK_RES | |
| 91 * | |
| 92 * Constructor for BLOCK results | |
| 93 *************************************************************************/ | |
| 94 | |
| 95 BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) { | |
| 96 ROW_IT row_it(the_block->row_list()); | |
| 97 ROW_RES_IT row_res_it(&row_res_list); | |
| 98 | |
| 99 char_count = 0; | |
| 100 rej_count = 0; | |
| 101 font_class = -1; // not assigned | |
| 102 x_height = -1.0; | |
| 103 font_assigned = false; | |
| 104 row_count = 0; | |
| 105 | |
| 106 block = the_block; | |
| 107 | |
| 108 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 109 row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data())); | |
| 110 } | |
| 111 } | |
| 112 | |
| 113 /************************************************************************* | |
| 114 * ROW_RES::ROW_RES | |
| 115 * | |
| 116 * Constructor for ROW results | |
| 117 *************************************************************************/ | |
| 118 | |
| 119 ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) { | |
| 120 WERD_IT word_it(the_row->word_list()); | |
| 121 WERD_RES_IT word_res_it(&word_res_list); | |
| 122 WERD_RES *combo = nullptr; // current combination of fuzzies | |
| 123 WERD *copy_word; | |
| 124 | |
| 125 char_count = 0; | |
| 126 rej_count = 0; | |
| 127 whole_word_rej_count = 0; | |
| 128 | |
| 129 row = the_row; | |
| 130 bool add_next_word = false; | |
| 131 TBOX union_box; | |
| 132 float line_height = | |
| 133 the_row->x_height() + the_row->ascenders() - the_row->descenders(); | |
| 134 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 135 auto *word_res = new WERD_RES(word_it.data()); | |
| 136 word_res->x_height = the_row->x_height(); | |
| 137 if (add_next_word) { | |
| 138 ASSERT_HOST(combo != nullptr); | |
| 139 // We are adding this word to the combination. | |
| 140 word_res->part_of_combo = true; | |
| 141 combo->copy_on(word_res); | |
| 142 } else if (merge_similar_words) { | |
| 143 union_box = word_res->word->bounding_box(); | |
| 144 add_next_word = !word_res->word->flag(W_REP_CHAR) && | |
| 145 union_box.height() <= line_height * kMaxWordSizeRatio; | |
| 146 word_res->odd_size = !add_next_word; | |
| 147 } | |
| 148 WERD *next_word = word_it.data_relative(1); | |
| 149 if (merge_similar_words) { | |
| 150 if (add_next_word && !next_word->flag(W_REP_CHAR)) { | |
| 151 // Next word will be added on if all of the following are true: | |
| 152 // Not a rep char. | |
| 153 // Box height small enough. | |
| 154 // Union box height small enough. | |
| 155 // Horizontal gap small enough. | |
| 156 TBOX next_box = next_word->bounding_box(); | |
| 157 int prev_right = union_box.right(); | |
| 158 union_box += next_box; | |
| 159 if (next_box.height() > line_height * kMaxWordSizeRatio || | |
| 160 union_box.height() > line_height * kMaxLineSizeRatio || | |
| 161 next_box.left() > prev_right + line_height * kMaxWordGapRatio) { | |
| 162 add_next_word = false; | |
| 163 } | |
| 164 } | |
| 165 next_word->set_flag(W_FUZZY_NON, add_next_word); | |
| 166 } else { | |
| 167 add_next_word = next_word->flag(W_FUZZY_NON); | |
| 168 } | |
| 169 if (add_next_word) { | |
| 170 if (combo == nullptr) { | |
| 171 copy_word = new WERD; | |
| 172 *copy_word = *(word_it.data()); // deep copy | |
| 173 combo = new WERD_RES(copy_word); | |
| 174 combo->x_height = the_row->x_height(); | |
| 175 combo->combination = true; | |
| 176 word_res_it.add_to_end(combo); | |
| 177 } | |
| 178 word_res->part_of_combo = true; | |
| 179 } else { | |
| 180 combo = nullptr; | |
| 181 } | |
| 182 word_res_it.add_to_end(word_res); | |
| 183 } | |
| 184 } | |
| 185 | |
| 186 WERD_RES &WERD_RES::operator=(const WERD_RES &source) { | |
| 187 this->ELIST_LINK::operator=(source); | |
| 188 Clear(); | |
| 189 if (source.combination) { | |
| 190 word = new WERD; | |
| 191 *word = *(source.word); // deep copy | |
| 192 } else { | |
| 193 word = source.word; // pt to same word | |
| 194 } | |
| 195 if (source.bln_boxes != nullptr) { | |
| 196 bln_boxes = new tesseract::BoxWord(*source.bln_boxes); | |
| 197 } | |
| 198 if (source.chopped_word != nullptr) { | |
| 199 chopped_word = new TWERD(*source.chopped_word); | |
| 200 } | |
| 201 if (source.rebuild_word != nullptr) { | |
| 202 rebuild_word = new TWERD(*source.rebuild_word); | |
| 203 } | |
| 204 // TODO(rays) Do we ever need to copy the seam_array? | |
| 205 blob_row = source.blob_row; | |
| 206 denorm = source.denorm; | |
| 207 if (source.box_word != nullptr) { | |
| 208 box_word = new tesseract::BoxWord(*source.box_word); | |
| 209 } | |
| 210 best_state = source.best_state; | |
| 211 correct_text = source.correct_text; | |
| 212 blob_widths = source.blob_widths; | |
| 213 blob_gaps = source.blob_gaps; | |
| 214 // None of the uses of operator= require the ratings matrix to be copied, | |
| 215 // so don't as it would be really slow. | |
| 216 | |
| 217 // Copy the cooked choices. | |
| 218 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&source.best_choices)); | |
| 219 WERD_CHOICE_IT wc_dest_it(&best_choices); | |
| 220 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) { | |
| 221 const WERD_CHOICE *choice = wc_it.data(); | |
| 222 wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice)); | |
| 223 } | |
| 224 if (!wc_dest_it.empty()) { | |
| 225 wc_dest_it.move_to_first(); | |
| 226 best_choice = wc_dest_it.data(); | |
| 227 } else { | |
| 228 best_choice = nullptr; | |
| 229 } | |
| 230 | |
| 231 if (source.raw_choice != nullptr) { | |
| 232 raw_choice = new WERD_CHOICE(*source.raw_choice); | |
| 233 } else { | |
| 234 raw_choice = nullptr; | |
| 235 } | |
| 236 if (source.ep_choice != nullptr) { | |
| 237 ep_choice = new WERD_CHOICE(*source.ep_choice); | |
| 238 } else { | |
| 239 ep_choice = nullptr; | |
| 240 } | |
| 241 reject_map = source.reject_map; | |
| 242 combination = source.combination; | |
| 243 part_of_combo = source.part_of_combo; | |
| 244 CopySimpleFields(source); | |
| 245 if (source.blamer_bundle != nullptr) { | |
| 246 blamer_bundle = new BlamerBundle(*(source.blamer_bundle)); | |
| 247 } | |
| 248 return *this; | |
| 249 } | |
| 250 | |
| 251 // Copies basic fields that don't involve pointers that might be useful | |
| 252 // to copy when making one WERD_RES from another. | |
| 253 void WERD_RES::CopySimpleFields(const WERD_RES &source) { | |
| 254 tess_failed = source.tess_failed; | |
| 255 tess_accepted = source.tess_accepted; | |
| 256 tess_would_adapt = source.tess_would_adapt; | |
| 257 done = source.done; | |
| 258 unlv_crunch_mode = source.unlv_crunch_mode; | |
| 259 small_caps = source.small_caps; | |
| 260 odd_size = source.odd_size; | |
| 261 fontinfo = source.fontinfo; | |
| 262 fontinfo2 = source.fontinfo2; | |
| 263 fontinfo_id_count = source.fontinfo_id_count; | |
| 264 fontinfo_id2_count = source.fontinfo_id2_count; | |
| 265 x_height = source.x_height; | |
| 266 caps_height = source.caps_height; | |
| 267 baseline_shift = source.baseline_shift; | |
| 268 guessed_x_ht = source.guessed_x_ht; | |
| 269 guessed_caps_ht = source.guessed_caps_ht; | |
| 270 reject_spaces = source.reject_spaces; | |
| 271 uch_set = source.uch_set; | |
| 272 tesseract = source.tesseract; | |
| 273 } | |
| 274 | |
| 275 // Initializes a blank (default constructed) WERD_RES from one that has | |
| 276 // already been recognized. | |
| 277 // Use SetupFor*Recognition afterwards to complete the setup and make | |
| 278 // it ready for a retry recognition. | |
| 279 void WERD_RES::InitForRetryRecognition(const WERD_RES &source) { | |
| 280 word = source.word; | |
| 281 CopySimpleFields(source); | |
| 282 if (source.blamer_bundle != nullptr) { | |
| 283 blamer_bundle = new BlamerBundle(); | |
| 284 blamer_bundle->CopyTruth(*source.blamer_bundle); | |
| 285 } | |
| 286 } | |
| 287 | |
| 288 // Sets up the members used in recognition: bln_boxes, chopped_word, | |
| 289 // seam_array, denorm. Returns false if | |
| 290 // the word is empty and sets up fake results. If use_body_size is | |
| 291 // true and row->body_size is set, then body_size will be used for | |
| 292 // blob normalization instead of xheight + ascrise. This flag is for | |
| 293 // those languages that are using CJK pitch model and thus it has to | |
| 294 // be true if and only if tesseract->textord_use_cjk_fp_model is | |
| 295 // true. | |
| 296 // If allow_detailed_fx is true, the feature extractor will receive fine | |
| 297 // precision outline information, allowing smoother features and better | |
| 298 // features on low resolution images. | |
| 299 // The norm_mode_hint sets the default mode for normalization in absence | |
| 300 // of any of the above flags. | |
| 301 // norm_box is used to override the word bounding box to determine the | |
| 302 // normalization scale and offset. | |
| 303 // Returns false if the word is empty and sets up fake results. | |
| 304 bool WERD_RES::SetupForRecognition(const UNICHARSET &unicharset_in, | |
| 305 tesseract::Tesseract *tess, Image pix, | |
| 306 int norm_mode, const TBOX *norm_box, | |
| 307 bool numeric_mode, bool use_body_size, | |
| 308 bool allow_detailed_fx, ROW *row, | |
| 309 const BLOCK *block) { | |
| 310 auto norm_mode_hint = static_cast<tesseract::OcrEngineMode>(norm_mode); | |
| 311 tesseract = tess; | |
| 312 POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr; | |
| 313 if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY && | |
| 314 word->cblob_list()->empty()) || | |
| 315 (pb != nullptr && !pb->IsText())) { | |
| 316 // Empty words occur when all the blobs have been moved to the rej_blobs | |
| 317 // list, which seems to occur frequently in junk. | |
| 318 SetupFake(unicharset_in); | |
| 319 word->set_flag(W_REP_CHAR, false); | |
| 320 return false; | |
| 321 } | |
| 322 ClearResults(); | |
| 323 SetupWordScript(unicharset_in); | |
| 324 chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word); | |
| 325 float word_xheight = | |
| 326 use_body_size && row != nullptr && row->body_size() > 0.0f | |
| 327 ? row->body_size() | |
| 328 : x_height; | |
| 329 chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE), | |
| 330 word_xheight, baseline_shift, numeric_mode, | |
| 331 norm_mode_hint, norm_box, &denorm); | |
| 332 blob_row = row; | |
| 333 SetupBasicsFromChoppedWord(unicharset_in); | |
| 334 SetupBlamerBundle(); | |
| 335 int num_blobs = chopped_word->NumBlobs(); | |
| 336 ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks); | |
| 337 tess_failed = false; | |
| 338 return true; | |
| 339 } | |
| 340 | |
| 341 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty | |
| 342 // accumulators from a made chopped word. We presume the fields are already | |
| 343 // empty. | |
| 344 void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) { | |
| 345 bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word); | |
| 346 start_seam_list(chopped_word, &seam_array); | |
| 347 SetupBlobWidthsAndGaps(); | |
| 348 ClearWordChoices(); | |
| 349 } | |
| 350 | |
| 351 // Sets up the members used in recognition for an empty recognition result: | |
| 352 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. | |
| 353 void WERD_RES::SetupFake(const UNICHARSET &unicharset_in) { | |
| 354 ClearResults(); | |
| 355 SetupWordScript(unicharset_in); | |
| 356 chopped_word = new TWERD; | |
| 357 rebuild_word = new TWERD; | |
| 358 bln_boxes = new tesseract::BoxWord; | |
| 359 box_word = new tesseract::BoxWord; | |
| 360 int blob_count = word->cblob_list()->length(); | |
| 361 if (blob_count > 0) { | |
| 362 auto **fake_choices = new BLOB_CHOICE *[blob_count]; | |
| 363 // For non-text blocks, just pass any blobs through to the box_word | |
| 364 // and call the word failed with a fake classification. | |
| 365 C_BLOB_IT b_it(word->cblob_list()); | |
| 366 int blob_id = 0; | |
| 367 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { | |
| 368 TBOX box = b_it.data()->bounding_box(); | |
| 369 box_word->InsertBox(box_word->length(), box); | |
| 370 fake_choices[blob_id++] = new BLOB_CHOICE; | |
| 371 } | |
| 372 FakeClassifyWord(blob_count, fake_choices); | |
| 373 delete[] fake_choices; | |
| 374 } else { | |
| 375 auto *word = new WERD_CHOICE(&unicharset_in); | |
| 376 word->make_bad(); | |
| 377 LogNewRawChoice(word); | |
| 378 // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice. | |
| 379 LogNewCookedChoice(1, false, word); | |
| 380 } | |
| 381 tess_failed = true; | |
| 382 done = true; | |
| 383 } | |
| 384 | |
| 385 void WERD_RES::SetupWordScript(const UNICHARSET &uch) { | |
| 386 uch_set = &uch; | |
| 387 int script = uch.default_sid(); | |
| 388 word->set_script_id(script); | |
| 389 word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight()); | |
| 390 word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid()); | |
| 391 } | |
| 392 | |
| 393 // Sets up the blamer_bundle if it is not null, using the initialized denorm. | |
| 394 void WERD_RES::SetupBlamerBundle() { | |
| 395 if (blamer_bundle != nullptr) { | |
| 396 blamer_bundle->SetupNormTruthWord(denorm); | |
| 397 } | |
| 398 } | |
| 399 | |
| 400 // Computes the blob_widths and blob_gaps from the chopped_word. | |
| 401 void WERD_RES::SetupBlobWidthsAndGaps() { | |
| 402 blob_widths.clear(); | |
| 403 blob_gaps.clear(); | |
| 404 int num_blobs = chopped_word->NumBlobs(); | |
| 405 for (int b = 0; b < num_blobs; ++b) { | |
| 406 TBLOB *blob = chopped_word->blobs[b]; | |
| 407 TBOX box = blob->bounding_box(); | |
| 408 blob_widths.push_back(box.width()); | |
| 409 if (b + 1 < num_blobs) { | |
| 410 blob_gaps.push_back(chopped_word->blobs[b + 1]->bounding_box().left() - | |
| 411 box.right()); | |
| 412 } | |
| 413 } | |
| 414 } | |
| 415 | |
| 416 // Updates internal data to account for a new SEAM (chop) at the given | |
| 417 // blob_number. Fixes the ratings matrix and states in the choices, as well | |
| 418 // as the blob widths and gaps. | |
| 419 void WERD_RES::InsertSeam(int blob_number, SEAM *seam) { | |
| 420 // Insert the seam into the SEAMS array. | |
| 421 seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true); | |
| 422 seam_array.insert(seam_array.begin() + blob_number, seam); | |
| 423 if (ratings != nullptr) { | |
| 424 // Expand the ratings matrix. | |
| 425 ratings = ratings->ConsumeAndMakeBigger(blob_number); | |
| 426 // Fix all the segmentation states. | |
| 427 if (raw_choice != nullptr) { | |
| 428 raw_choice->UpdateStateForSplit(blob_number); | |
| 429 } | |
| 430 WERD_CHOICE_IT wc_it(&best_choices); | |
| 431 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) { | |
| 432 WERD_CHOICE *choice = wc_it.data(); | |
| 433 choice->UpdateStateForSplit(blob_number); | |
| 434 } | |
| 435 SetupBlobWidthsAndGaps(); | |
| 436 } | |
| 437 } | |
| 438 | |
| 439 // Returns true if all the word choices except the first have adjust_factors | |
| 440 // worse than the given threshold. | |
| 441 bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const { | |
| 442 // The choices are not changed by this iteration. | |
| 443 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&best_choices)); | |
| 444 for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) { | |
| 445 WERD_CHOICE *choice = wc_it.data(); | |
| 446 if (choice->adjust_factor() <= threshold) { | |
| 447 return false; | |
| 448 } | |
| 449 } | |
| 450 return true; | |
| 451 } | |
| 452 | |
| 453 // Returns true if the current word is ambiguous (by number of answers or | |
| 454 // by dangerous ambigs.) | |
| 455 bool WERD_RES::IsAmbiguous() { | |
| 456 return !best_choices.singleton() || best_choice->dangerous_ambig_found(); | |
| 457 } | |
| 458 | |
| 459 // Returns true if the ratings matrix size matches the sum of each of the | |
| 460 // segmentation states. | |
| 461 bool WERD_RES::StatesAllValid() { | |
| 462 unsigned ratings_dim = ratings->dimension(); | |
| 463 if (raw_choice->TotalOfStates() != ratings_dim) { | |
| 464 tprintf("raw_choice has total of states = %u vs ratings dim of %u\n", | |
| 465 raw_choice->TotalOfStates(), ratings_dim); | |
| 466 return false; | |
| 467 } | |
| 468 WERD_CHOICE_IT it(&best_choices); | |
| 469 unsigned index = 0; | |
| 470 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) { | |
| 471 WERD_CHOICE *choice = it.data(); | |
| 472 if (choice->TotalOfStates() != ratings_dim) { | |
| 473 tprintf("Cooked #%u has total of states = %u vs ratings dim of %u\n", | |
| 474 index, choice->TotalOfStates(), ratings_dim); | |
| 475 return false; | |
| 476 } | |
| 477 } | |
| 478 return true; | |
| 479 } | |
| 480 | |
| 481 // Prints a list of words found if debug is true or the word result matches | |
| 482 // the word_to_debug. | |
| 483 void WERD_RES::DebugWordChoices(bool debug, const char *word_to_debug) { | |
| 484 if (debug || (word_to_debug != nullptr && *word_to_debug != '\0' && | |
| 485 best_choice != nullptr && | |
| 486 best_choice->unichar_string() == std::string(word_to_debug))) { | |
| 487 if (raw_choice != nullptr) { | |
| 488 raw_choice->print("\nBest Raw Choice"); | |
| 489 } | |
| 490 | |
| 491 WERD_CHOICE_IT it(&best_choices); | |
| 492 int index = 0; | |
| 493 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) { | |
| 494 WERD_CHOICE *choice = it.data(); | |
| 495 std::string label; | |
| 496 label += "\nCooked Choice #" + std::to_string(index); | |
| 497 choice->print(label.c_str()); | |
| 498 } | |
| 499 } | |
| 500 } | |
| 501 | |
| 502 // Prints the top choice along with the accepted/done flags. | |
| 503 void WERD_RES::DebugTopChoice(const char *msg) const { | |
| 504 tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ", tess_accepted, | |
| 505 tess_would_adapt, done); | |
| 506 if (best_choice == nullptr) { | |
| 507 tprintf("<Null choice>\n"); | |
| 508 } else { | |
| 509 best_choice->print(msg); | |
| 510 } | |
| 511 } | |
| 512 | |
| 513 // Removes from best_choices all choices which are not within a reasonable | |
| 514 // range of the best choice. | |
| 515 // TODO(rays) incorporate the information used here into the params training | |
| 516 // re-ranker, in place of this heuristic that is based on the previous | |
| 517 // adjustment factor. | |
| 518 void WERD_RES::FilterWordChoices(int debug_level) { | |
| 519 if (best_choice == nullptr || best_choices.singleton()) { | |
| 520 return; | |
| 521 } | |
| 522 | |
| 523 if (debug_level >= 2) { | |
| 524 best_choice->print("\nFiltering against best choice"); | |
| 525 } | |
| 526 WERD_CHOICE_IT it(&best_choices); | |
| 527 int index = 0; | |
| 528 for (it.forward(); !it.at_first(); it.forward(), ++index) { | |
| 529 WERD_CHOICE *choice = it.data(); | |
| 530 float threshold = StopperAmbigThreshold(best_choice->adjust_factor(), | |
| 531 choice->adjust_factor()); | |
| 532 // i, j index the blob choice in choice, best_choice. | |
| 533 // chunk is an index into the chopped_word blobs (AKA chunks). | |
| 534 // Since the two words may use different segmentations of the chunks, we | |
| 535 // iterate over the chunks to find out whether a comparable blob | |
| 536 // classification is much worse than the best result. | |
| 537 unsigned i = 0, j = 0, chunk = 0; | |
| 538 // Each iteration of the while deals with 1 chunk. On entry choice_chunk | |
| 539 // and best_chunk are the indices of the first chunk in the NEXT blob, | |
| 540 // i.e. we don't have to increment i, j while chunk < choice_chunk and | |
| 541 // best_chunk respectively. | |
| 542 auto choice_chunk = choice->state(0), best_chunk = best_choice->state(0); | |
| 543 while (i < choice->length() && j < best_choice->length()) { | |
| 544 if (choice->unichar_id(i) != best_choice->unichar_id(j) && | |
| 545 choice->certainty(i) - best_choice->certainty(j) < threshold) { | |
| 546 if (debug_level >= 2) { | |
| 547 choice->print("WorstCertaintyDiffWorseThan"); | |
| 548 tprintf( | |
| 549 "i %u j %u Choice->Blob[i].Certainty %.4g" | |
| 550 " WorstOtherChoiceCertainty %g Threshold %g\n", | |
| 551 i, j, choice->certainty(i), best_choice->certainty(j), threshold); | |
| 552 tprintf("Discarding bad choice #%d\n", index); | |
| 553 } | |
| 554 delete it.extract(); | |
| 555 break; | |
| 556 } | |
| 557 ++chunk; | |
| 558 // If needed, advance choice_chunk to keep up with chunk. | |
| 559 while (choice_chunk < chunk && ++i < choice->length()) { | |
| 560 choice_chunk += choice->state(i); | |
| 561 } | |
| 562 // If needed, advance best_chunk to keep up with chunk. | |
| 563 while (best_chunk < chunk && ++j < best_choice->length()) { | |
| 564 best_chunk += best_choice->state(j); | |
| 565 } | |
| 566 } | |
| 567 } | |
| 568 } | |
| 569 | |
| 570 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale, | |
| 571 float min_rating, float max_rating, | |
| 572 float rating_margin, | |
| 573 float *thresholds) { | |
| 574 int chunk = 0; | |
| 575 int end_chunk = best_choice->state(0); | |
| 576 int end_raw_chunk = raw_choice->state(0); | |
| 577 int raw_blob = 0; | |
| 578 for (unsigned i = 0; i < best_choice->length(); i++, thresholds++) { | |
| 579 float avg_rating = 0.0f; | |
| 580 int num_error_chunks = 0; | |
| 581 | |
| 582 // For each chunk in best choice blob i, count non-matching raw results. | |
| 583 while (chunk < end_chunk) { | |
| 584 if (chunk >= end_raw_chunk) { | |
| 585 ++raw_blob; | |
| 586 end_raw_chunk += raw_choice->state(raw_blob); | |
| 587 } | |
| 588 if (best_choice->unichar_id(i) != raw_choice->unichar_id(raw_blob)) { | |
| 589 avg_rating += raw_choice->certainty(raw_blob); | |
| 590 ++num_error_chunks; | |
| 591 } | |
| 592 ++chunk; | |
| 593 } | |
| 594 | |
| 595 if (num_error_chunks > 0) { | |
| 596 avg_rating /= num_error_chunks; | |
| 597 *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin); | |
| 598 } else { | |
| 599 *thresholds = max_rating; | |
| 600 } | |
| 601 | |
| 602 if (*thresholds > max_rating) { | |
| 603 *thresholds = max_rating; | |
| 604 } | |
| 605 if (*thresholds < min_rating) { | |
| 606 *thresholds = min_rating; | |
| 607 } | |
| 608 } | |
| 609 } | |
| 610 | |
| 611 // Saves a copy of the word_choice if it has the best unadjusted rating. | |
| 612 // Returns true if the word_choice was the new best. | |
| 613 bool WERD_RES::LogNewRawChoice(WERD_CHOICE *word_choice) { | |
| 614 if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) { | |
| 615 delete raw_choice; | |
| 616 raw_choice = new WERD_CHOICE(*word_choice); | |
| 617 raw_choice->set_permuter(TOP_CHOICE_PERM); | |
| 618 return true; | |
| 619 } | |
| 620 return false; | |
| 621 } | |
| 622 | |
| 623 // Consumes word_choice by adding it to best_choices, (taking ownership) if | |
| 624 // the certainty for word_choice is some distance of the best choice in | |
| 625 // best_choices, or by deleting the word_choice and returning false. | |
| 626 // The best_choices list is kept in sorted order by rating. Duplicates are | |
| 627 // removed, and the list is kept no longer than max_num_choices in length. | |
| 628 // Returns true if the word_choice is still a valid pointer. | |
| 629 bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug, | |
| 630 WERD_CHOICE *word_choice) { | |
| 631 if (best_choice != nullptr) { | |
| 632 // Throw out obviously bad choices to save some work. | |
| 633 // TODO(rays) Get rid of this! This piece of code produces different | |
| 634 // results according to the order in which words are found, which is an | |
| 635 // undesirable behavior. It would be better to keep all the choices and | |
| 636 // prune them later when more information is available. | |
| 637 float max_certainty_delta = StopperAmbigThreshold( | |
| 638 best_choice->adjust_factor(), word_choice->adjust_factor()); | |
| 639 if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) { | |
| 640 max_certainty_delta = -kStopperAmbiguityThresholdOffset; | |
| 641 } | |
| 642 if (word_choice->certainty() - best_choice->certainty() < | |
| 643 max_certainty_delta) { | |
| 644 if (debug) { | |
| 645 std::string bad_string; | |
| 646 word_choice->string_and_lengths(&bad_string, nullptr); | |
| 647 tprintf( | |
| 648 "Discarding choice \"%s\" with an overly low certainty" | |
| 649 " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n", | |
| 650 bad_string.c_str(), word_choice->certainty(), | |
| 651 best_choice->certainty(), | |
| 652 max_certainty_delta + best_choice->certainty()); | |
| 653 } | |
| 654 delete word_choice; | |
| 655 return false; | |
| 656 } | |
| 657 } | |
| 658 | |
| 659 // Insert in the list in order of increasing rating, but knock out worse | |
| 660 // string duplicates. | |
| 661 WERD_CHOICE_IT it(&best_choices); | |
| 662 const std::string &new_str = word_choice->unichar_string(); | |
| 663 bool inserted = false; | |
| 664 int num_choices = 0; | |
| 665 if (!it.empty()) { | |
| 666 do { | |
| 667 WERD_CHOICE *choice = it.data(); | |
| 668 if (choice->rating() > word_choice->rating() && !inserted) { | |
| 669 // Time to insert. | |
| 670 it.add_before_stay_put(word_choice); | |
| 671 inserted = true; | |
| 672 if (num_choices == 0) { | |
| 673 best_choice = word_choice; // This is the new best. | |
| 674 } | |
| 675 ++num_choices; | |
| 676 } | |
| 677 if (choice->unichar_string() == new_str) { | |
| 678 if (inserted) { | |
| 679 // New is better. | |
| 680 delete it.extract(); | |
| 681 } else { | |
| 682 // Old is better. | |
| 683 if (debug) { | |
| 684 tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n", | |
| 685 new_str.c_str(), word_choice->rating(), choice->rating()); | |
| 686 } | |
| 687 delete word_choice; | |
| 688 return false; | |
| 689 } | |
| 690 } else { | |
| 691 ++num_choices; | |
| 692 if (num_choices > max_num_choices) { | |
| 693 delete it.extract(); | |
| 694 } | |
| 695 } | |
| 696 it.forward(); | |
| 697 } while (!it.at_first()); | |
| 698 } | |
| 699 if (!inserted && num_choices < max_num_choices) { | |
| 700 it.add_to_end(word_choice); | |
| 701 inserted = true; | |
| 702 if (num_choices == 0) { | |
| 703 best_choice = word_choice; // This is the new best. | |
| 704 } | |
| 705 } | |
| 706 if (debug) { | |
| 707 if (inserted) { | |
| 708 tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary"); | |
| 709 } else { | |
| 710 tprintf("Poor"); | |
| 711 } | |
| 712 word_choice->print(" Word Choice"); | |
| 713 } | |
| 714 if (!inserted) { | |
| 715 delete word_choice; | |
| 716 return false; | |
| 717 } | |
| 718 return true; | |
| 719 } | |
| 720 | |
// Simple helper moves the ownership of the pointer data from src to dest,
// first deleting anything in dest, and nulling out src afterwards.
// Aliasing is handled safely:
//  - if dest and src are the same pointer variable, nothing is done;
//  - if they are different variables that already point at the same object,
//    the object is kept (not deleted), so dest is never left dangling.
template <class T>
static void MovePointerData(T **dest, T **src) {
  if (dest == src) {
    return; // Moving a pointer onto itself: nothing to transfer.
  }
  if (*dest != *src) {
    delete *dest;
  }
  *dest = *src;
  *src = nullptr;
}
| 729 | |
| 730 // Prints a brief list of all the best choices. | |
| 731 void WERD_RES::PrintBestChoices() const { | |
| 732 std::string alternates_str; | |
| 733 WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST *>(&best_choices)); | |
| 734 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 735 if (!it.at_first()) { | |
| 736 alternates_str += "\", \""; | |
| 737 } | |
| 738 alternates_str += it.data()->unichar_string(); | |
| 739 } | |
| 740 tprintf("Alternates for \"%s\": {\"%s\"}\n", | |
| 741 best_choice->unichar_string().c_str(), alternates_str.c_str()); | |
| 742 } | |
| 743 | |
| 744 // Returns the sum of the widths of the blob between start_blob and last_blob | |
| 745 // inclusive. | |
| 746 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) const { | |
| 747 int result = 0; | |
| 748 for (int b = start_blob; b <= last_blob; ++b) { | |
| 749 result += blob_widths[b]; | |
| 750 if (b < last_blob) { | |
| 751 result += blob_gaps[b]; | |
| 752 } | |
| 753 } | |
| 754 return result; | |
| 755 } | |
| 756 // Returns the width of a gap between the specified blob and the next one. | |
| 757 int WERD_RES::GetBlobsGap(unsigned blob_index) const { | |
| 758 if (blob_index >= blob_gaps.size()) { | |
| 759 return 0; | |
| 760 } | |
| 761 return blob_gaps[blob_index]; | |
| 762 } | |
| 763 | |
| 764 // Returns the BLOB_CHOICE corresponding to the given index in the | |
| 765 // best choice word taken from the appropriate cell in the ratings MATRIX. | |
| 766 // Borrowed pointer, so do not delete. May return nullptr if there is no | |
| 767 // BLOB_CHOICE matching the unichar_id at the given index. | |
| 768 BLOB_CHOICE *WERD_RES::GetBlobChoice(unsigned index) const { | |
| 769 if (index >= best_choice->length()) { | |
| 770 return nullptr; | |
| 771 } | |
| 772 BLOB_CHOICE_LIST *choices = GetBlobChoices(index); | |
| 773 return FindMatchingChoice(best_choice->unichar_id(index), choices); | |
| 774 } | |
| 775 | |
| 776 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the | |
| 777 // best choice word taken from the appropriate cell in the ratings MATRIX. | |
| 778 // Borrowed pointer, so do not delete. | |
| 779 BLOB_CHOICE_LIST *WERD_RES::GetBlobChoices(int index) const { | |
| 780 return best_choice->blob_choices(index, ratings); | |
| 781 } | |
| 782 | |
| 783 // Moves the results fields from word to this. This takes ownership of all | |
| 784 // the data, so src can be destructed. | |
| 785 void WERD_RES::ConsumeWordResults(WERD_RES *word) { | |
| 786 denorm = word->denorm; | |
| 787 blob_row = word->blob_row; | |
| 788 MovePointerData(&chopped_word, &word->chopped_word); | |
| 789 MovePointerData(&rebuild_word, &word->rebuild_word); | |
| 790 MovePointerData(&box_word, &word->box_word); | |
| 791 for (auto data : seam_array) { | |
| 792 delete data; | |
| 793 } | |
| 794 seam_array = word->seam_array; | |
| 795 word->seam_array.clear(); | |
| 796 // TODO: optimize moves. | |
| 797 best_state = word->best_state; | |
| 798 word->best_state.clear(); | |
| 799 correct_text = word->correct_text; | |
| 800 word->correct_text.clear(); | |
| 801 blob_widths = word->blob_widths; | |
| 802 word->blob_widths.clear(); | |
| 803 blob_gaps = word->blob_gaps; | |
| 804 word->blob_gaps.clear(); | |
| 805 if (ratings != nullptr) { | |
| 806 ratings->delete_matrix_pointers(); | |
| 807 } | |
| 808 MovePointerData(&ratings, &word->ratings); | |
| 809 best_choice = word->best_choice; | |
| 810 MovePointerData(&raw_choice, &word->raw_choice); | |
| 811 best_choices.clear(); | |
| 812 WERD_CHOICE_IT wc_it(&best_choices); | |
| 813 wc_it.add_list_after(&word->best_choices); | |
| 814 reject_map = word->reject_map; | |
| 815 if (word->blamer_bundle != nullptr) { | |
| 816 assert(blamer_bundle != nullptr); | |
| 817 blamer_bundle->CopyResults(*(word->blamer_bundle)); | |
| 818 } | |
| 819 CopySimpleFields(*word); | |
| 820 } | |
| 821 | |
| 822 // Replace the best choice and rebuild box word. | |
| 823 // choice must be from the current best_choices list. | |
| 824 void WERD_RES::ReplaceBestChoice(WERD_CHOICE *choice) { | |
| 825 best_choice = choice; | |
| 826 RebuildBestState(); | |
| 827 SetupBoxWord(); | |
| 828 // Make up a fake reject map of the right length to keep the | |
| 829 // rejection pass happy. | |
| 830 reject_map.initialise(best_state.size()); | |
| 831 done = tess_accepted = tess_would_adapt = true; | |
| 832 SetScriptPositions(); | |
| 833 } | |
| 834 | |
| 835 // Builds the rebuild_word and sets the best_state from the chopped_word and | |
| 836 // the best_choice->state. | |
| 837 void WERD_RES::RebuildBestState() { | |
| 838 ASSERT_HOST(best_choice != nullptr); | |
| 839 delete rebuild_word; | |
| 840 rebuild_word = new TWERD; | |
| 841 if (seam_array.empty()) { | |
| 842 start_seam_list(chopped_word, &seam_array); | |
| 843 } | |
| 844 best_state.clear(); | |
| 845 int start = 0; | |
| 846 for (unsigned i = 0; i < best_choice->length(); ++i) { | |
| 847 int length = best_choice->state(i); | |
| 848 best_state.push_back(length); | |
| 849 if (length > 1) { | |
| 850 SEAM::JoinPieces(seam_array, chopped_word->blobs, start, | |
| 851 start + length - 1); | |
| 852 } | |
| 853 TBLOB *blob = chopped_word->blobs[start]; | |
| 854 rebuild_word->blobs.push_back(new TBLOB(*blob)); | |
| 855 if (length > 1) { | |
| 856 SEAM::BreakPieces(seam_array, chopped_word->blobs, start, | |
| 857 start + length - 1); | |
| 858 } | |
| 859 start += length; | |
| 860 } | |
| 861 } | |
| 862 | |
| 863 // Copies the chopped_word to the rebuild_word, faking a best_state as well. | |
| 864 // Also sets up the output box_word. | |
| 865 void WERD_RES::CloneChoppedToRebuild() { | |
| 866 delete rebuild_word; | |
| 867 rebuild_word = new TWERD(*chopped_word); | |
| 868 SetupBoxWord(); | |
| 869 auto word_len = box_word->length(); | |
| 870 best_state.reserve(word_len); | |
| 871 correct_text.reserve(word_len); | |
| 872 for (unsigned i = 0; i < word_len; ++i) { | |
| 873 best_state.push_back(1); | |
| 874 correct_text.emplace_back(""); | |
| 875 } | |
| 876 } | |
| 877 | |
| 878 // Sets/replaces the box_word with one made from the rebuild_word. | |
| 879 void WERD_RES::SetupBoxWord() { | |
| 880 delete box_word; | |
| 881 rebuild_word->ComputeBoundingBoxes(); | |
| 882 box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word); | |
| 883 box_word->ClipToOriginalWord(denorm.block(), word); | |
| 884 } | |
| 885 | |
| 886 // Sets up the script positions in the output best_choice using the best_choice | |
| 887 // to get the unichars, and the unicharset to get the target positions. | |
| 888 void WERD_RES::SetScriptPositions() { | |
| 889 best_choice->SetScriptPositions(small_caps, chopped_word); | |
| 890 } | |
| 891 // Sets all the blobs in all the words (raw choice and best choices) to be | |
| 892 // the given position. (When a sub/superscript is recognized as a separate | |
| 893 // word, it falls victim to the rule that a whole word cannot be sub or | |
| 894 // superscript, so this function overrides that problem.) | |
| 895 void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) { | |
| 896 raw_choice->SetAllScriptPositions(position); | |
| 897 WERD_CHOICE_IT wc_it(&best_choices); | |
| 898 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) { | |
| 899 wc_it.data()->SetAllScriptPositions(position); | |
| 900 } | |
| 901 } | |
| 902 | |
| 903 // Classifies the word with some already-calculated BLOB_CHOICEs. | |
| 904 // The choices are an array of blob_count pointers to BLOB_CHOICE, | |
| 905 // providing a single classifier result for each blob. | |
| 906 // The BLOB_CHOICEs are consumed and the word takes ownership. | |
| 907 // The number of blobs in the box_word must match blob_count. | |
| 908 void WERD_RES::FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices) { | |
| 909 // Setup the WERD_RES. | |
| 910 ASSERT_HOST(box_word != nullptr); | |
| 911 ASSERT_HOST(blob_count == box_word->length()); | |
| 912 ClearWordChoices(); | |
| 913 ClearRatings(); | |
| 914 ratings = new MATRIX(blob_count, 1); | |
| 915 for (unsigned c = 0; c < blob_count; ++c) { | |
| 916 auto *choice_list = new BLOB_CHOICE_LIST; | |
| 917 BLOB_CHOICE_IT choice_it(choice_list); | |
| 918 choice_it.add_after_then_move(choices[c]); | |
| 919 ratings->put(c, c, choice_list); | |
| 920 } | |
| 921 FakeWordFromRatings(TOP_CHOICE_PERM); | |
| 922 reject_map.initialise(blob_count); | |
| 923 best_state.clear(); | |
| 924 best_state.resize(blob_count, 1); | |
| 925 done = true; | |
| 926 } | |
| 927 | |
| 928 // Creates a WERD_CHOICE for the word using the top choices from the leading | |
| 929 // diagonal of the ratings matrix. | |
| 930 void WERD_RES::FakeWordFromRatings(PermuterType permuter) { | |
| 931 int num_blobs = ratings->dimension(); | |
| 932 auto *word_choice = new WERD_CHOICE(uch_set, num_blobs); | |
| 933 word_choice->set_permuter(permuter); | |
| 934 for (int b = 0; b < num_blobs; ++b) { | |
| 935 UNICHAR_ID unichar_id = UNICHAR_SPACE; | |
| 936 // Initialize rating and certainty like in WERD_CHOICE::make_bad(). | |
| 937 float rating = WERD_CHOICE::kBadRating; | |
| 938 float certainty = -FLT_MAX; | |
| 939 BLOB_CHOICE_LIST *choices = ratings->get(b, b); | |
| 940 if (choices != nullptr && !choices->empty()) { | |
| 941 BLOB_CHOICE_IT bc_it(choices); | |
| 942 BLOB_CHOICE *choice = bc_it.data(); | |
| 943 unichar_id = choice->unichar_id(); | |
| 944 rating = choice->rating(); | |
| 945 certainty = choice->certainty(); | |
| 946 } | |
| 947 word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating, | |
| 948 certainty); | |
| 949 } | |
| 950 LogNewRawChoice(word_choice); | |
| 951 // Ownership of word_choice taken by word here. | |
| 952 LogNewCookedChoice(1, false, word_choice); | |
| 953 } | |
| 954 | |
| 955 // Copies the best_choice strings to the correct_text for adaption/training. | |
| 956 void WERD_RES::BestChoiceToCorrectText() { | |
| 957 correct_text.clear(); | |
| 958 ASSERT_HOST(best_choice != nullptr); | |
| 959 for (unsigned i = 0; i < best_choice->length(); ++i) { | |
| 960 UNICHAR_ID choice_id = best_choice->unichar_id(i); | |
| 961 const char *blob_choice = uch_set->id_to_unichar(choice_id); | |
| 962 correct_text.emplace_back(blob_choice); | |
| 963 } | |
| 964 } | |
| 965 | |
| 966 // Merges 2 adjacent blobs in the result if the permanent callback | |
| 967 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent | |
| 968 // callback box_cb is nullptr or returns true, setting the merged blob | |
| 969 // result to the class returned from class_cb. | |
| 970 // Returns true if anything was merged. | |
| 971 bool WERD_RES::ConditionalBlobMerge( | |
| 972 const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb, | |
| 973 const std::function<bool(const TBOX &, const TBOX &)> &box_cb) { | |
| 974 ASSERT_HOST(best_choice->empty() || ratings != nullptr); | |
| 975 bool modified = false; | |
| 976 for (unsigned i = 0; i + 1 < best_choice->length(); ++i) { | |
| 977 UNICHAR_ID new_id = | |
| 978 class_cb(best_choice->unichar_id(i), best_choice->unichar_id(i + 1)); | |
| 979 if (new_id != INVALID_UNICHAR_ID && | |
| 980 (box_cb == nullptr || | |
| 981 box_cb(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) { | |
| 982 // Raw choice should not be fixed. | |
| 983 best_choice->set_unichar_id(new_id, i); | |
| 984 modified = true; | |
| 985 MergeAdjacentBlobs(i); | |
| 986 const MATRIX_COORD &coord = best_choice->MatrixCoord(i); | |
| 987 if (!coord.Valid(*ratings)) { | |
| 988 ratings->IncreaseBandSize(coord.row + 1 - coord.col); | |
| 989 } | |
| 990 BLOB_CHOICE_LIST *blob_choices = GetBlobChoices(i); | |
| 991 if (FindMatchingChoice(new_id, blob_choices) == nullptr) { | |
| 992 // Insert a fake result. | |
| 993 auto *blob_choice = new BLOB_CHOICE; | |
| 994 blob_choice->set_unichar_id(new_id); | |
| 995 BLOB_CHOICE_IT bc_it(blob_choices); | |
| 996 bc_it.add_before_then_move(blob_choice); | |
| 997 } | |
| 998 } | |
| 999 } | |
| 1000 return modified; | |
| 1001 } | |
| 1002 | |
| 1003 // Merges 2 adjacent blobs in the result (index and index+1) and corrects | |
| 1004 // all the data to account for the change. | |
| 1005 void WERD_RES::MergeAdjacentBlobs(unsigned index) { | |
| 1006 if (reject_map.length() == best_choice->length()) { | |
| 1007 reject_map.remove_pos(index); | |
| 1008 } | |
| 1009 best_choice->remove_unichar_id(index + 1); | |
| 1010 rebuild_word->MergeBlobs(index, index + 2); | |
| 1011 box_word->MergeBoxes(index, index + 2); | |
| 1012 if (index + 1 < best_state.size()) { | |
| 1013 best_state[index] += best_state[index + 1]; | |
| 1014 best_state.erase(best_state.begin() + index + 1); | |
| 1015 } | |
| 1016 } | |
| 1017 | |
| 1018 // TODO(tkielbus) Decide between keeping this behavior here or modifying the | |
| 1019 // training data. | |
| 1020 | |
// Utility function for fix_quotes.
// signed_str points at the next character of a UTF-8 string and length is
// that character's length in bytes. Returns non-zero if the character is a
// simple quote: the 1-byte ' or `, or one of the 3-byte curved single quotes
// encoded as E2 80 98 and E2 80 99. Returns 0 otherwise.
static int is_simple_quote(const char *signed_str, int length) {
  // Compare as unsigned bytes so the values >= 0x80 match on every platform.
  const auto *bytes = reinterpret_cast<const unsigned char *>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 bytes curved quotes share their first two bytes.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return false;
}
| 1033 | |
| 1034 // Callback helper for fix_quotes returns a double quote if both | |
| 1035 // arguments are quote, otherwise INVALID_UNICHAR_ID. | |
| 1036 UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) { | |
| 1037 const char *ch = uch_set->id_to_unichar(id1); | |
| 1038 const char *next_ch = uch_set->id_to_unichar(id2); | |
| 1039 if (is_simple_quote(ch, strlen(ch)) && | |
| 1040 is_simple_quote(next_ch, strlen(next_ch))) { | |
| 1041 return uch_set->unichar_to_id("\""); | |
| 1042 } | |
| 1043 return INVALID_UNICHAR_ID; | |
| 1044 } | |
| 1045 | |
| 1046 // Change pairs of quotes to double quotes. | |
| 1047 void WERD_RES::fix_quotes() { | |
| 1048 if (!uch_set->contains_unichar("\"") || | |
| 1049 !uch_set->get_enabled(uch_set->unichar_to_id("\""))) { | |
| 1050 return; // Don't create it if it is disallowed. | |
| 1051 } | |
| 1052 | |
| 1053 using namespace std::placeholders; // for _1, _2 | |
| 1054 ConditionalBlobMerge(std::bind(&WERD_RES::BothQuotes, this, _1, _2), nullptr); | |
| 1055 } | |
| 1056 | |
| 1057 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both | |
| 1058 // arguments are hyphen, otherwise INVALID_UNICHAR_ID. | |
| 1059 UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) { | |
| 1060 const char *ch = uch_set->id_to_unichar(id1); | |
| 1061 const char *next_ch = uch_set->id_to_unichar(id2); | |
| 1062 if (strlen(ch) == 1 && strlen(next_ch) == 1 && (*ch == '-' || *ch == '~') && | |
| 1063 (*next_ch == '-' || *next_ch == '~')) { | |
| 1064 return uch_set->unichar_to_id("-"); | |
| 1065 } | |
| 1066 return INVALID_UNICHAR_ID; | |
| 1067 } | |
| 1068 | |
| 1069 // Callback helper for fix_hyphens returns true if box1 and box2 overlap | |
| 1070 // (assuming both on the same textline, are in order and a chopped em dash.) | |
| 1071 bool WERD_RES::HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2) { | |
| 1072 return box1.right() >= box2.left(); | |
| 1073 } | |
| 1074 | |
| 1075 // Change pairs of hyphens to a single hyphen if the bounding boxes touch | |
| 1076 // Typically a long dash which has been segmented. | |
| 1077 void WERD_RES::fix_hyphens() { | |
| 1078 if (!uch_set->contains_unichar("-") || | |
| 1079 !uch_set->get_enabled(uch_set->unichar_to_id("-"))) { | |
| 1080 return; // Don't create it if it is disallowed. | |
| 1081 } | |
| 1082 | |
| 1083 using namespace std::placeholders; // for _1, _2 | |
| 1084 ConditionalBlobMerge(std::bind(&WERD_RES::BothHyphens, this, _1, _2), | |
| 1085 std::bind(&WERD_RES::HyphenBoxesOverlap, this, _1, _2)); | |
| 1086 } | |
| 1087 | |
| 1088 // Callback helper for merge_tess_fails returns a space if both | |
| 1089 // arguments are space, otherwise INVALID_UNICHAR_ID. | |
| 1090 UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) { | |
| 1091 if (id1 == id2 && id1 == uch_set->unichar_to_id(" ")) { | |
| 1092 return id1; | |
| 1093 } else { | |
| 1094 return INVALID_UNICHAR_ID; | |
| 1095 } | |
| 1096 } | |
| 1097 | |
| 1098 // Change pairs of tess failures to a single one | |
| 1099 void WERD_RES::merge_tess_fails() { | |
| 1100 using namespace std::placeholders; // for _1, _2 | |
| 1101 if (ConditionalBlobMerge(std::bind(&WERD_RES::BothSpaces, this, _1, _2), | |
| 1102 nullptr)) { | |
| 1103 unsigned len = best_choice->length(); | |
| 1104 ASSERT_HOST(reject_map.length() == len); | |
| 1105 ASSERT_HOST(box_word->length() == len); | |
| 1106 } | |
| 1107 } | |
| 1108 | |
| 1109 // Returns true if the collection of count pieces, starting at start, are all | |
| 1110 // natural connected components, ie there are no real chops involved. | |
| 1111 bool WERD_RES::PiecesAllNatural(int start, int count) const { | |
| 1112 // all seams must have no splits. | |
| 1113 for (int index = start; index < start + count - 1; ++index) { | |
| 1114 if (index >= 0 && static_cast<size_t>(index) < seam_array.size()) { | |
| 1115 SEAM *seam = seam_array[index]; | |
| 1116 if (seam != nullptr && seam->HasAnySplits()) { | |
| 1117 return false; | |
| 1118 } | |
| 1119 } | |
| 1120 } | |
| 1121 return true; | |
| 1122 } | |
| 1123 | |
| 1124 WERD_RES::~WERD_RES() { | |
| 1125 Clear(); | |
| 1126 } | |
| 1127 | |
| 1128 void WERD_RES::Clear() { | |
| 1129 if (combination) { | |
| 1130 delete word; | |
| 1131 } | |
| 1132 word = nullptr; | |
| 1133 delete blamer_bundle; | |
| 1134 blamer_bundle = nullptr; | |
| 1135 ClearResults(); | |
| 1136 } | |
| 1137 | |
| 1138 void WERD_RES::ClearResults() { | |
| 1139 done = false; | |
| 1140 fontinfo = nullptr; | |
| 1141 fontinfo2 = nullptr; | |
| 1142 fontinfo_id_count = 0; | |
| 1143 fontinfo_id2_count = 0; | |
| 1144 delete bln_boxes; | |
| 1145 bln_boxes = nullptr; | |
| 1146 blob_row = nullptr; | |
| 1147 delete chopped_word; | |
| 1148 chopped_word = nullptr; | |
| 1149 delete rebuild_word; | |
| 1150 rebuild_word = nullptr; | |
| 1151 delete box_word; | |
| 1152 box_word = nullptr; | |
| 1153 best_state.clear(); | |
| 1154 correct_text.clear(); | |
| 1155 for (auto data : seam_array) { | |
| 1156 delete data; | |
| 1157 } | |
| 1158 seam_array.clear(); | |
| 1159 blob_widths.clear(); | |
| 1160 blob_gaps.clear(); | |
| 1161 ClearRatings(); | |
| 1162 ClearWordChoices(); | |
| 1163 if (blamer_bundle != nullptr) { | |
| 1164 blamer_bundle->ClearResults(); | |
| 1165 } | |
| 1166 } | |
| 1167 void WERD_RES::ClearWordChoices() { | |
| 1168 best_choice = nullptr; | |
| 1169 delete raw_choice; | |
| 1170 raw_choice = nullptr; | |
| 1171 best_choices.clear(); | |
| 1172 delete ep_choice; | |
| 1173 ep_choice = nullptr; | |
| 1174 } | |
| 1175 void WERD_RES::ClearRatings() { | |
| 1176 if (ratings != nullptr) { | |
| 1177 ratings->delete_matrix_pointers(); | |
| 1178 delete ratings; | |
| 1179 ratings = nullptr; | |
| 1180 } | |
| 1181 } | |
| 1182 | |
| 1183 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const { | |
| 1184 ASSERT_HOST(page_res == other.page_res); | |
| 1185 if (other.block_res == nullptr) { | |
| 1186 // other points to the end of the page. | |
| 1187 if (block_res == nullptr) { | |
| 1188 return 0; | |
| 1189 } | |
| 1190 return -1; | |
| 1191 } | |
| 1192 if (block_res == nullptr) { | |
| 1193 return 1; // we point to the end of the page. | |
| 1194 } | |
| 1195 if (block_res == other.block_res) { | |
| 1196 if (other.row_res == nullptr || row_res == nullptr) { | |
| 1197 // this should only happen if we hit an image block. | |
| 1198 return 0; | |
| 1199 } | |
| 1200 if (row_res == other.row_res) { | |
| 1201 // we point to the same block and row. | |
| 1202 ASSERT_HOST(other.word_res != nullptr && word_res != nullptr); | |
| 1203 if (word_res == other.word_res) { | |
| 1204 // we point to the same word! | |
| 1205 return 0; | |
| 1206 } | |
| 1207 | |
| 1208 WERD_RES_IT word_res_it(&row_res->word_res_list); | |
| 1209 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); | |
| 1210 word_res_it.forward()) { | |
| 1211 if (word_res_it.data() == word_res) { | |
| 1212 return -1; | |
| 1213 } else if (word_res_it.data() == other.word_res) { | |
| 1214 return 1; | |
| 1215 } | |
| 1216 } | |
| 1217 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr); | |
| 1218 } | |
| 1219 | |
| 1220 // we both point to the same block, but different rows. | |
| 1221 ROW_RES_IT row_res_it(&block_res->row_res_list); | |
| 1222 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); | |
| 1223 row_res_it.forward()) { | |
| 1224 if (row_res_it.data() == row_res) { | |
| 1225 return -1; | |
| 1226 } else if (row_res_it.data() == other.row_res) { | |
| 1227 return 1; | |
| 1228 } | |
| 1229 } | |
| 1230 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr); | |
| 1231 } | |
| 1232 | |
| 1233 // We point to different blocks. | |
| 1234 BLOCK_RES_IT block_res_it(&page_res->block_res_list); | |
| 1235 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); | |
| 1236 block_res_it.forward()) { | |
| 1237 if (block_res_it.data() == block_res) { | |
| 1238 return -1; | |
| 1239 } else if (block_res_it.data() == other.block_res) { | |
| 1240 return 1; | |
| 1241 } | |
| 1242 } | |
| 1243 // Shouldn't happen... | |
| 1244 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr); | |
| 1245 return 0; | |
| 1246 } | |
| 1247 | |
| 1248 // Inserts the new_word as a combination owned by a corresponding WERD_RES | |
| 1249 // before the current position. The simple fields of the WERD_RES are copied | |
| 1250 // from clone_res and the resulting WERD_RES is returned for further setup | |
| 1251 // with best_choice etc. | |
| 1252 WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res, | |
| 1253 WERD *new_word) { | |
| 1254 // Make a WERD_RES for the new_word. | |
| 1255 auto *new_res = new WERD_RES(new_word); | |
| 1256 new_res->CopySimpleFields(clone_res); | |
| 1257 new_res->combination = true; | |
| 1258 // Insert into the appropriate place in the ROW_RES. | |
| 1259 WERD_RES_IT wr_it(&row()->word_res_list); | |
| 1260 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { | |
| 1261 WERD_RES *word = wr_it.data(); | |
| 1262 if (word == word_res) { | |
| 1263 break; | |
| 1264 } | |
| 1265 } | |
| 1266 ASSERT_HOST(!wr_it.cycled_list()); | |
| 1267 wr_it.add_before_then_move(new_res); | |
| 1268 if (wr_it.at_first()) { | |
| 1269 // This is the new first word, so reset the member iterator so it | |
| 1270 // detects the cycled_list state correctly. | |
| 1271 ResetWordIterator(); | |
| 1272 } | |
| 1273 return new_res; | |
| 1274 } | |
| 1275 | |
| 1276 // Helper computes the boundaries between blobs in the word. The blob bounds | |
| 1277 // are likely very poor, if they come from LSTM, where it only outputs the | |
| 1278 // character at one pixel within it, so we find the midpoints between them. | |
| 1279 static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box, | |
| 1280 C_BLOB_LIST *next_word_blobs, | |
| 1281 std::vector<int> *blob_ends) { | |
| 1282 C_BLOB_IT blob_it(word.word->cblob_list()); | |
| 1283 for (int length : word.best_state) { | |
| 1284 // Get the bounding box of the fake blobs | |
| 1285 TBOX blob_box = blob_it.data()->bounding_box(); | |
| 1286 blob_it.forward(); | |
| 1287 for (int b = 1; b < length; ++b) { | |
| 1288 blob_box += blob_it.data()->bounding_box(); | |
| 1289 blob_it.forward(); | |
| 1290 } | |
| 1291 // This blob_box is crap, so for now we are only looking for the | |
| 1292 // boundaries between them. | |
| 1293 int blob_end = INT32_MAX; | |
| 1294 if (!blob_it.at_first() || next_word_blobs != nullptr) { | |
| 1295 if (blob_it.at_first()) { | |
| 1296 blob_it.set_to_list(next_word_blobs); | |
| 1297 } | |
| 1298 blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2; | |
| 1299 } | |
| 1300 blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right()); | |
| 1301 blob_ends->push_back(blob_end); | |
| 1302 } | |
| 1303 blob_ends->back() = clip_box.right(); | |
| 1304 } | |
| 1305 | |
| 1306 // Helper computes the bounds of a word by restricting it to existing words | |
| 1307 // that significantly overlap. | |
| 1308 static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES> &words, | |
| 1309 int w_index, TBOX prev_box, WERD_RES_IT w_it) { | |
| 1310 constexpr int kSignificantOverlapFraction = 4; | |
| 1311 TBOX clipped_box; | |
| 1312 TBOX current_box = words[w_index]->word->bounding_box(); | |
| 1313 TBOX next_box; | |
| 1314 if (static_cast<size_t>(w_index + 1) < words.size() && | |
| 1315 words[w_index + 1] != nullptr && words[w_index + 1]->word != nullptr) { | |
| 1316 next_box = words[w_index + 1]->word->bounding_box(); | |
| 1317 } | |
| 1318 for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo; | |
| 1319 w_it.forward()) { | |
| 1320 if (w_it.data() == nullptr || w_it.data()->word == nullptr) { | |
| 1321 continue; | |
| 1322 } | |
| 1323 TBOX w_box = w_it.data()->word->bounding_box(); | |
| 1324 int height_limit = std::min<int>(w_box.height(), w_box.width() / 2); | |
| 1325 int width_limit = w_box.width() / kSignificantOverlapFraction; | |
| 1326 int min_significant_overlap = std::max(height_limit, width_limit); | |
| 1327 int overlap = w_box.intersection(current_box).width(); | |
| 1328 int prev_overlap = w_box.intersection(prev_box).width(); | |
| 1329 int next_overlap = w_box.intersection(next_box).width(); | |
| 1330 if (overlap > min_significant_overlap) { | |
| 1331 if (prev_overlap > min_significant_overlap) { | |
| 1332 // We have no choice but to use the LSTM word edge. | |
| 1333 clipped_box.set_left(current_box.left()); | |
| 1334 } else if (next_overlap > min_significant_overlap) { | |
| 1335 // We have no choice but to use the LSTM word edge. | |
| 1336 clipped_box.set_right(current_box.right()); | |
| 1337 } else { | |
| 1338 clipped_box += w_box; | |
| 1339 } | |
| 1340 } | |
| 1341 } | |
| 1342 if (clipped_box.height() <= 0) { | |
| 1343 clipped_box.set_top(current_box.top()); | |
| 1344 clipped_box.set_bottom(current_box.bottom()); | |
| 1345 } | |
| 1346 if (clipped_box.width() <= 0) { | |
| 1347 clipped_box = current_box; | |
| 1348 } | |
| 1349 return clipped_box; | |
| 1350 } | |
| 1351 | |
| 1352 // Helper moves the blob from src to dest. If it isn't contained by clip_box, | |
| 1353 // the blob is replaced by a fake that is contained. | |
| 1354 static TBOX MoveAndClipBlob(C_BLOB_IT *src_it, C_BLOB_IT *dest_it, | |
| 1355 const TBOX &clip_box) { | |
| 1356 C_BLOB *src_blob = src_it->extract(); | |
| 1357 TBOX box = src_blob->bounding_box(); | |
| 1358 if (!clip_box.contains(box)) { | |
| 1359 int left = | |
| 1360 ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1); | |
| 1361 int right = | |
| 1362 ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right()); | |
| 1363 int top = | |
| 1364 ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top()); | |
| 1365 int bottom = | |
| 1366 ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1); | |
| 1367 box = TBOX(left, bottom, right, top); | |
| 1368 delete src_blob; | |
| 1369 src_blob = C_BLOB::FakeBlob(box); | |
| 1370 } | |
| 1371 dest_it->add_after_then_move(src_blob); | |
| 1372 return box; | |
| 1373 } | |
| 1374 | |
| 1375 // Replaces the current WERD/WERD_RES with the given words. The given words | |
| 1376 // contain fake blobs that indicate the position of the characters. These are | |
| 1377 // replaced with real blobs from the current word as much as possible. | |
| 1378 void PAGE_RES_IT::ReplaceCurrentWord( | |
| 1379 tesseract::PointerVector<WERD_RES> *words) { | |
| 1380 if (words->empty()) { | |
| 1381 DeleteCurrentWord(); | |
| 1382 return; | |
| 1383 } | |
| 1384 WERD_RES *input_word = word(); | |
| 1385 // Set the BOL/EOL flags on the words from the input word. | |
| 1386 if (input_word->word->flag(W_BOL)) { | |
| 1387 (*words)[0]->word->set_flag(W_BOL, true); | |
| 1388 } else { | |
| 1389 (*words)[0]->word->set_blanks(input_word->word->space()); | |
| 1390 } | |
| 1391 words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL)); | |
| 1392 | |
| 1393 // Move the blobs from the input word to the new set of words. | |
| 1394 // If the input word_res is a combination, then the replacements will also be | |
| 1395 // combinations, and will own their own words. If the input word_res is not a | |
| 1396 // combination, then the final replacements will not be either, (although it | |
| 1397 // is allowed for the input words to be combinations) and their words | |
| 1398 // will get put on the row list. This maintains the ownership rules. | |
| 1399 WERD_IT w_it(row()->row->word_list()); | |
| 1400 if (!input_word->combination) { | |
| 1401 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { | |
| 1402 WERD *word = w_it.data(); | |
| 1403 if (word == input_word->word) { | |
| 1404 break; | |
| 1405 } | |
| 1406 } | |
| 1407 // w_it is now set to the input_word's word. | |
| 1408 ASSERT_HOST(!w_it.cycled_list()); | |
| 1409 } | |
| 1410 // Insert into the appropriate place in the ROW_RES. | |
| 1411 WERD_RES_IT wr_it(&row()->word_res_list); | |
| 1412 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { | |
| 1413 WERD_RES *word = wr_it.data(); | |
| 1414 if (word == input_word) { | |
| 1415 break; | |
| 1416 } | |
| 1417 } | |
| 1418 ASSERT_HOST(!wr_it.cycled_list()); | |
| 1419 // Since we only have an estimate of the bounds between blobs, use the blob | |
| 1420 // x-middle as the determiner of where to put the blobs | |
| 1421 C_BLOB_IT src_b_it(input_word->word->cblob_list()); | |
| 1422 src_b_it.sort(&C_BLOB::SortByXMiddle); | |
| 1423 C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list()); | |
| 1424 rej_b_it.sort(&C_BLOB::SortByXMiddle); | |
| 1425 TBOX clip_box; | |
| 1426 for (size_t w = 0; w < words->size(); ++w) { | |
| 1427 WERD_RES *word_w = (*words)[w]; | |
| 1428 clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word); | |
| 1429 // Compute blob boundaries. | |
| 1430 std::vector<int> blob_ends; | |
| 1431 C_BLOB_LIST *next_word_blobs = | |
| 1432 w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr; | |
| 1433 ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends); | |
| 1434 // Remove the fake blobs on the current word, but keep safe for back-up if | |
| 1435 // no blob can be found. | |
| 1436 C_BLOB_LIST fake_blobs; | |
| 1437 C_BLOB_IT fake_b_it(&fake_blobs); | |
| 1438 fake_b_it.add_list_after(word_w->word->cblob_list()); | |
| 1439 fake_b_it.move_to_first(); | |
| 1440 word_w->word->cblob_list()->clear(); | |
| 1441 C_BLOB_IT dest_it(word_w->word->cblob_list()); | |
| 1442 // Build the box word as we move the blobs. | |
| 1443 auto *box_word = new tesseract::BoxWord; | |
| 1444 for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) { | |
| 1445 int end_x = blob_ends[i]; | |
| 1446 TBOX blob_box; | |
| 1447 // Add the blobs up to end_x. | |
| 1448 while (!src_b_it.empty() && | |
| 1449 src_b_it.data()->bounding_box().x_middle() < end_x) { | |
| 1450 blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box); | |
| 1451 src_b_it.forward(); | |
| 1452 } | |
| 1453 while (!rej_b_it.empty() && | |
| 1454 rej_b_it.data()->bounding_box().x_middle() < end_x) { | |
| 1455 blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box); | |
| 1456 rej_b_it.forward(); | |
| 1457 } | |
| 1458 if (blob_box.null_box()) { | |
| 1459 // Use the original box as a back-up. | |
| 1460 blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box); | |
| 1461 } | |
| 1462 box_word->InsertBox(i, blob_box); | |
| 1463 } | |
| 1464 delete word_w->box_word; | |
| 1465 word_w->box_word = box_word; | |
| 1466 if (!input_word->combination) { | |
| 1467 // Insert word_w->word into the ROW. It doesn't own its word, so the | |
| 1468 // ROW needs to own it. | |
| 1469 w_it.add_before_stay_put(word_w->word); | |
| 1470 word_w->combination = false; | |
| 1471 } | |
| 1472 (*words)[w] = nullptr; // We are taking ownership. | |
| 1473 wr_it.add_before_stay_put(word_w); | |
| 1474 } | |
| 1475 // We have taken ownership of the words. | |
| 1476 words->clear(); | |
| 1477 // Delete the current word, which has been replaced. We could just call | |
| 1478 // DeleteCurrentWord, but that would iterate both lists again, and we know | |
| 1479 // we are already in the right place. | |
| 1480 if (!input_word->combination) { | |
| 1481 delete w_it.extract(); | |
| 1482 } | |
| 1483 delete wr_it.extract(); | |
| 1484 ResetWordIterator(); | |
| 1485 } | |
| 1486 | |
| 1487 // Deletes the current WERD_RES and, when it is not a combination, also extracts and deletes its WERD from the word list of the ROW. Dereferences word_res without a null check and finishes by calling ResetWordIterator(). | |
| 1488 void PAGE_RES_IT::DeleteCurrentWord() { | |
| 1489 // Check that this word is as we expect. part_of_combos are NEVER iterated | |
| 1490 // by the normal iterator, so we should never be trying to delete them. | |
| 1491 ASSERT_HOST(!word_res->part_of_combo); | |
| 1492 if (!word_res->combination) { | |
| 1493 // Combinations own their own word, so that word is not on the word_list | |
| 1494 // of the ROW; deleting a combination is still legitimate. For any other | |
| 1495 // word, find it on the word_list of the ROW, then extract and delete it. | |
| 1496 WERD_IT w_it(row()->row->word_list()); | |
| 1497 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { | |
| 1498 if (w_it.data() == word_res->word) { | |
| 1499 break; | |
| 1500 } | |
| 1501 } | |
| 1502 ASSERT_HOST(!w_it.cycled_list()); | |
| 1503 delete w_it.extract(); | |
| 1504 } | |
| 1505 // Find the current WERD_RES on the word_res_list of the ROW_RES. The member | |
| 1506 // word_res is set to nullptr as soon as it is found, because the object is extracted and deleted just below. | |
| 1507 WERD_RES_IT wr_it(&row()->word_res_list); | |
| 1508 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { | |
| 1509 if (wr_it.data() == word_res) { | |
| 1510 word_res = nullptr; | |
| 1511 break; | |
| 1512 } | |
| 1513 } | |
| 1514 ASSERT_HOST(!wr_it.cycled_list()); | |
| 1515 delete wr_it.extract(); | |
| 1516 ResetWordIterator(); | |
| 1517 } | |
| 1518 | |
| 1519 // Makes the current word a fuzzy space if not already fuzzy, i.e. sets W_FUZZY_SP on its WERD unless W_FUZZY_SP or W_FUZZY_NON is already set. | |
| 1520 // For a combination, the entry that follows it in the word_res_list of the row (asserted to be a part_of_combo and not yet fuzzy) gets W_FUZZY_SP as well. | |
| 1521 void PAGE_RES_IT::MakeCurrentWordFuzzy() { | |
| 1522 WERD *real_word = word_res->word; | |
| 1523 if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) { | |
| 1524 real_word->set_flag(W_FUZZY_SP, true); | |
| 1525 if (word_res->combination) { | |
| 1526 // The next word should be the corresponding part of combo, but we have | |
| 1527 // already stepped past it, so find it by search. NOTE(review): a failed search (cycled list) is not asserted before wr_it.forward() is called. | |
| 1528 WERD_RES_IT wr_it(&row()->word_res_list); | |
| 1529 for (wr_it.mark_cycle_pt(); | |
| 1530 !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) { | |
| 1531 } | |
| 1532 wr_it.forward(); | |
| 1533 ASSERT_HOST(wr_it.data()->part_of_combo); | |
| 1534 real_word = wr_it.data()->word; | |
| 1535 ASSERT_HOST(!real_word->flag(W_FUZZY_SP) && | |
| 1536 !real_word->flag(W_FUZZY_NON)); | |
| 1537 real_word->set_flag(W_FUZZY_SP, true); | |
| 1538 } | |
| 1539 } | |
| 1540 } | |
| 1541 | |
| 1542 /************************************************************************* | |
| 1543 * PAGE_RES_IT::start_page | |
| 1544 * | |
| 1545 * Set things up at the start of the page: point block_res_it at the block_res_list of page_res, mark its cycle point, and set the previous, current and next block, row and word pointers to nullptr. internal_forward is then called twice, the first time with new_block set, so that the first call fills the next slots and the second shifts them into the current slots. Returns the result of the second call. empty_ok is passed through to both calls. | |
| 1546 *************************************************************************/ | |
| 1547 | |
| 1548 WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) { | |
| 1549 block_res_it.set_to_list(&page_res->block_res_list); | |
| 1550 block_res_it.mark_cycle_pt(); | |
| 1551 prev_block_res = nullptr; | |
| 1552 prev_row_res = nullptr; | |
| 1553 prev_word_res = nullptr; | |
| 1554 block_res = nullptr; | |
| 1555 row_res = nullptr; | |
| 1556 word_res = nullptr; | |
| 1557 next_block_res = nullptr; | |
| 1558 next_row_res = nullptr; | |
| 1559 next_word_res = nullptr; | |
| 1560 internal_forward(true, empty_ok); | |
| 1561 return internal_forward(false, empty_ok); | |
| 1562 } | |
| 1563 | |
| 1564 // Recovers from operations on the current word, such as in InsertCloneWord | |
| 1565 // and DeleteCurrentWord. | |
| 1566 // Resets the word_res_it so that it is one past the next_word_res, as | |
| 1567 // it should be after internal_forward. If next_row_res != row_res, | |
| 1568 // then the next_word_res is in the next row, so there is no need to do | |
| 1569 // anything to word_res_it, but it is still a good idea to reset the pointers | |
| 1570 // word_res and prev_word_res, which are still in the current row. In both cases word_res ends up as the last non-part_of_combo entry visited, and prev_word_res is only updated when prev_row_res == row_res. | |
| 1571 void PAGE_RES_IT::ResetWordIterator() { | |
| 1572 if (row_res == next_row_res) { | |
| 1573 // Reset the member iterator so it can move forward and detect the | |
| 1574 // cycled_list state correctly. The walk stops at next_word_res, which must be on this list (asserted after the loop). | |
| 1575 word_res_it.move_to_first(); | |
| 1576 for (word_res_it.mark_cycle_pt(); | |
| 1577 !word_res_it.cycled_list() && word_res_it.data() != next_word_res; | |
| 1578 word_res_it.forward()) { | |
| 1579 if (!word_res_it.data()->part_of_combo) { | |
| 1580 if (prev_row_res == row_res) { | |
| 1581 prev_word_res = word_res; | |
| 1582 } | |
| 1583 word_res = word_res_it.data(); | |
| 1584 } | |
| 1585 } | |
| 1586 ASSERT_HOST(!word_res_it.cycled_list()); | |
| 1587 wr_it_of_next_word = word_res_it; | |
| 1588 word_res_it.forward(); | |
| 1589 } else { | |
| 1590 // word_res_it is OK, but reset word_res and prev_word_res if needed, by scanning every entry of the current row. NOTE(review): row_res is dereferenced here without a null check. | |
| 1591 WERD_RES_IT wr_it(&row_res->word_res_list); | |
| 1592 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { | |
| 1593 if (!wr_it.data()->part_of_combo) { | |
| 1594 if (prev_row_res == row_res) { | |
| 1595 prev_word_res = word_res; | |
| 1596 } | |
| 1597 word_res = wr_it.data(); | |
| 1598 } | |
| 1599 } | |
| 1600 } | |
| 1601 } | |
| 1602 | |
| 1603 /************************************************************************* | |
| 1604 * PAGE_RES_IT::internal_forward | |
| 1605 * | |
| 1606 * Find the next word on the page. If empty_ok is true, then non-text blocks | |
| 1607 * and text blocks with no text are visited as if they contain a single | |
| 1608 * imaginary word in a single imaginary row. (word() and row() both return | |
| 1609 * nullptr in such a block and the return value is nullptr.) If empty_ok is | |
| 1610 * false, the old behaviour is maintained. Each real word is visited and empty | |
| 1611 * and non-text blocks and rows are skipped. new_block is used to initialize the | |
| 1612 * iterators for a new block. The iterator maintains pointers to block, row and | |
| 1613 * word for the previous, current and next words. These are correct, regardless | |
| 1614 * of block/row boundaries. nullptr values denote start and end of the page. Returns the new current word, i.e. the word that was the next word before the call. | |
| 1615 *************************************************************************/ | |
| 1616 | |
| 1617 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) { | |
| 1618 bool new_row = false; | |
| 1619 | |
| 1620 prev_block_res = block_res; | |
| 1621 prev_row_res = row_res; | |
| 1622 prev_word_res = word_res; | |
| 1623 block_res = next_block_res; | |
| 1624 row_res = next_row_res; | |
| 1625 word_res = next_word_res; | |
| 1626 wr_it_of_current_word = wr_it_of_next_word; | |
| 1627 next_block_res = nullptr; | |
| 1628 next_row_res = nullptr; | |
| 1629 next_word_res = nullptr; | |
| 1630 | |
| 1631 while (!block_res_it.cycled_list()) { | |
| 1632 if (new_block) { | |
| 1633 new_block = false; | |
| 1634 row_res_it.set_to_list(&block_res_it.data()->row_res_list); | |
| 1635 row_res_it.mark_cycle_pt(); | |
| 1636 if (row_res_it.empty() && empty_ok) { | |
| 1637 next_block_res = block_res_it.data(); | |
| 1638 break; | |
| 1639 } | |
| 1640 new_row = true; | |
| 1641 } | |
| 1642 while (!row_res_it.cycled_list()) { | |
| 1643 if (new_row) { | |
| 1644 new_row = false; | |
| 1645 word_res_it.set_to_list(&row_res_it.data()->word_res_list); | |
| 1646 word_res_it.mark_cycle_pt(); | |
| 1647 } | |
| 1648 // Skip any part_of_combo words. | |
| 1649 while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) { | |
| 1650 word_res_it.forward(); | |
| 1651 } | |
| 1652 if (!word_res_it.cycled_list()) { | |
| 1653 next_block_res = block_res_it.data(); | |
| 1654 next_row_res = row_res_it.data(); | |
| 1655 next_word_res = word_res_it.data(); | |
| 1656 wr_it_of_next_word = word_res_it; | |
| 1657 word_res_it.forward(); | |
| 1658 goto foundword; | |
| 1659 } | |
| 1660 // End of row reached: advance to the next row and re-seed word_res_it on the next pass. | |
| 1661 row_res_it.forward(); | |
| 1662 new_row = true; | |
| 1663 } | |
| 1664 // End of block reached: advance to the next block and re-seed row_res_it on the next pass. | |
| 1665 block_res_it.forward(); | |
| 1666 new_block = true; | |
| 1667 } | |
| 1668 foundword: | |
| 1669 // Update prev_word_best_choice pointer: nullptr when new_block is still set or there is no previous word, otherwise the best_choice of prev_word_res. | |
| 1670 if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) { | |
| 1671 *page_res->prev_word_best_choice = (new_block || prev_word_res == nullptr) | |
| 1672 ? nullptr | |
| 1673 : prev_word_res->best_choice; | |
| 1674 } | |
| 1675 return word_res; | |
| 1676 } | |
| 1677 | |
| 1678 /************************************************************************* | |
| 1679 * PAGE_RES_IT::restart_row() | |
| 1680 * | |
| 1681 * Move to the beginning (leftmost word) of the current row. Returns nullptr if there is no current row; otherwise calls restart_page(), steps with forward() until row() is the remembered row again, and returns word(). | |
| 1682 *************************************************************************/ | |
| 1683 WERD_RES *PAGE_RES_IT::restart_row() { | |
| 1684 ROW_RES *row = this->row(); | |
| 1685 if (!row) { | |
| 1686 return nullptr; | |
| 1687 } | |
| 1688 for (restart_page(); this->row() != row; forward()) { | |
| 1689 // Empty body: the loop header does all the stepping. | |
| 1690 } | |
| 1691 return word(); | |
| 1692 } | |
| 1693 | |
| 1694 /************************************************************************* | |
| 1695 * PAGE_RES_IT::forward_paragraph | |
| 1696 * | |
| 1697 * Move to the beginning of the next paragraph, allowing empty blocks. Steps with internal_forward(false, true) while the next word is in the current block and its row reports the same para() as the current row, then steps once more and returns the result of that last step. | |
| 1698 *************************************************************************/ | |
| 1699 | |
| 1700 WERD_RES *PAGE_RES_IT::forward_paragraph() { | |
| 1701 while (block_res == next_block_res && | |
| 1702 (next_row_res != nullptr && next_row_res->row != nullptr && | |
| 1703 row_res->row->para() == next_row_res->row->para())) { | |
| 1704 internal_forward(false, true); | |
| 1705 } | |
| 1706 return internal_forward(false, true); | |
| 1707 } | |
| 1708 | |
| 1709 /************************************************************************* | |
| 1710 * PAGE_RES_IT::forward_block | |
| 1711 * | |
| 1712 * Move to the beginning of the next block, allowing empty blocks. Steps with internal_forward(false, true) while the next word is still in the current block, then steps once more and returns the result of that last step. When there is no current block (block_res is nullptr, e.g. the iterator is already past the last block) no skipping is attempted: once block_res_it has cycled, internal_forward leaves block_res and next_block_res both nullptr, so an unguarded equality loop would never exit. | |
| 1713 *************************************************************************/ | |
| 1714 | |
| 1715 WERD_RES *PAGE_RES_IT::forward_block() { | |
| 1716 while (block_res != nullptr && block_res == next_block_res) { | |
| 1717 internal_forward(false, true); | |
| 1718 } | |
| 1719 return internal_forward(false, true); | |
| 1720 } | |
| 1721 | |
| 1722 void PAGE_RES_IT::rej_stat_word() { | |
| 1723 int16_t chars_in_word; | |
| 1724 int16_t rejects_in_word = 0; | |
| 1725 /* Adds the statistics of the current word to the page, block and row totals. First the character count: length() of the reject_map of the word is added to char_count at all three levels. */ | |
| 1726 chars_in_word = word_res->reject_map.length(); | |
| 1727 page_res->char_count += chars_in_word; | |
| 1728 block_res->char_count += chars_in_word; | |
| 1729 row_res->char_count += chars_in_word; | |
| 1730 /* Then the reject count: reject_count() of the same reject_map is added to rej_count at all three levels. */ | |
| 1731 rejects_in_word = word_res->reject_map.reject_count(); | |
| 1732 | |
| 1733 page_res->rej_count += rejects_in_word; | |
| 1734 block_res->rej_count += rejects_in_word; | |
| 1735 row_res->rej_count += rejects_in_word; | |
| 1736 if (chars_in_word == rejects_in_word) { /* whole word rejected; only the row keeps this total (a word with zero characters also matches and adds 0) */ | |
| 1737 row_res->whole_word_rej_count += rejects_in_word; | |
| 1738 } | |
| 1739 } | |
| 1740 | |
| 1741 } // namespace tesseract |
