Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/applybox.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: applybox.cpp (Formerly applybox.c) | |
| 3 * Description: Re segment rows according to box file data | |
| 4 * Author: Phil Cheatle | |
| 5 * | |
| 6 * (C) Copyright 1993, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #ifndef DISABLED_LEGACY_ENGINE | |
| 20 # include <allheaders.h> | |
| 21 # include <cctype> | |
| 22 # include <cerrno> | |
| 23 # include <cstring> | |
| 24 # include "boxread.h" | |
| 25 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 26 #include <tesseract/unichar.h> | |
| 27 #include "pageres.h" | |
| 28 #include "tesseractclass.h" | |
| 29 #include "tesserrstream.h" // for tesserr | |
| 30 #include "unicharset.h" | |
| 31 | |
| 32 #ifndef DISABLED_LEGACY_ENGINE | |
| 33 /** Max number of blobs to classify together in FindSegmentation. */ | |
| 34 const int kMaxGroupSize = 4; | |
| 35 /// Max fraction of median allowed as deviation in xheight before switching | |
| 36 /// to median. | |
| 37 const double kMaxXHeightDeviationFraction = 0.125; | |
| 38 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 39 | |
| 40 /** | |
| 41 * The box file is assumed to contain box definitions, one per line, of the | |
| 42 * following format for blob-level boxes: | |
| 43 * @verbatim | |
| 44 * <UTF8 str> <left> <bottom> <right> <top> <page id> | |
| 45 * @endverbatim | |
| 46 * and for word/line-level boxes: | |
| 47 * @verbatim | |
| 48 * WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str> | |
| 49 * @endverbatim | |
| 50 * NOTES: | |
| 51 * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT. | |
| 52 * | |
| 53 * <page id> is 0-based, and the page number is used for multipage input (tiff). | |
| 54 * | |
| 55 * In the blob-level form, each line represents a recognizable unit, which may | |
| 56 * be several UTF-8 bytes, but there is a bounding box around each recognizable | |
| 57 * unit, and no classifier is needed to train in this mode (bootstrapping.) | |
| 58 * | |
| 59 * In the word/line-level form, the line begins with the literal "WordStr", and | |
| 60 * the bounding box bounds either a whole line or a whole word. The recognizable | |
| 61 * units in the word/line are listed after the # at the end of the line and | |
| 62 * are space delimited, ignoring any original spaces on the line. | |
| 63 * Eg. | |
| 64 * @verbatim | |
| 65 * word -> #w o r d | |
| 66 * multi word line -> #m u l t i w o r d l i n e | |
| 67 * @endverbatim | |
| 68 * The recognizable units must be space-delimited in order to allow multiple | |
| 69 * unicodes to be used for a single recognizable unit, eg Hindi. | |
| 70 * | |
| 71 * In this mode, the classifier must have been pre-trained with the desired | |
| 72 * character set, or it will not be able to find the character segmentations. | |
| 73 */ | |
| 74 | |
| 75 namespace tesseract { | |
| 76 | |
| 77 #ifndef DISABLED_LEGACY_ENGINE | |
| 78 static void clear_any_old_text(BLOCK_LIST *block_list) { | |
| 79 BLOCK_IT block_it(block_list); | |
| 80 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 81 ROW_IT row_it(block_it.data()->row_list()); | |
| 82 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 83 WERD_IT word_it(row_it.data()->word_list()); | |
| 84 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 85 word_it.data()->set_text(""); | |
| 86 } | |
| 87 } | |
| 88 } | |
| 89 } | |
| 90 | |
| 91 // Applies the box file based on the image name filename, and resegments | |
| 92 // the words in the block_list (page), with: | |
| 93 // blob-mode: one blob per line in the box file, words as input. | |
| 94 // word/line-mode: one blob per space-delimited unit after the #, and one word | |
| 95 // per line in the box file. (See comment above for box file format.) | |
| 96 // If find_segmentation is true, (word/line mode) then the classifier is used | |
| 97 // to re-segment words/lines to match the space-delimited truth string for | |
| 98 // each box. In this case, the input box may be for a word or even a whole | |
| 99 // text line, and the output words will contain multiple blobs corresponding | |
| 100 // to the space-delimited input string. | |
| 101 // With find_segmentation false, no classifier is needed, but the chopper | |
| 102 // can still be used to correctly segment touching characters with the help | |
| 103 // of the input boxes. | |
| 104 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned | |
| 105 // from normal classification, ie. with a word, chopped_word, rebuild_word, | |
| 106 // seam_array, denorm, box_word, and best_state, but NO best_choice or | |
| 107 // raw_choice, as they would require a UNICHARSET, which we aim to avoid. | |
| 108 // Instead, the correct_text member of WERD_RES is set, and this may be later | |
| 109 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords | |
| 110 // is not required before calling ApplyBoxTraining. | |
PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,
                                BLOCK_LIST *block_list) {
  std::vector<TBOX> boxes;
  std::vector<std::string> texts, full_texts;
  // Load all boxes for the current applybox_page from the box file that
  // corresponds to the image name in filename.
  if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {
    return nullptr; // Can't do it.
  }

  const int box_count = boxes.size();
  int box_failures = 0;

  // In word mode, we use the boxes to make a word for each box, but
  // in blob mode we use the existing words and maximally chop them first.
  PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list);
  clear_any_old_text(block_list);

  for (int i = 0; i < box_count; i++) {
    bool foundit = false;
    if (page_res != nullptr) {
      // Blob mode: match box i against chopped blobs, with the previous and
      // next boxes (when they exist) available to settle overlap disputes.
      foundit =
          ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i],
                           (i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str());
    } else {
      // Word/line mode: gather all blobs overlapping box i into a new word.
      foundit = ResegmentWordBox(block_list, boxes[i],
                                 (i == box_count - 1) ? nullptr : &boxes[i + 1], texts[i].c_str());
    }
    if (!foundit) {
      box_failures++;
      ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
    }
  }

  if (page_res == nullptr) {
    // In word/line mode, we now maximally chop all the words and resegment
    // them with the classifier.
    page_res = SetupApplyBoxes(boxes, block_list);
    ReSegmentByClassification(page_res);
  }
  if (applybox_debug > 0) {
    tprintf("APPLY_BOXES:\n");
    tprintf("   Boxes read from boxfile:  %6d\n", box_count);
    if (box_failures > 0) {
      tprintf("   Boxes failed resegmentation:  %6d\n", box_failures);
    }
  }
  // Remove unused/empty words and rebuild words/choices for the survivors.
  TidyUp(page_res);
  return page_res;
}
| 159 | |
| 160 // Helper computes median xheight in the image. | |
| 161 static double MedianXHeight(BLOCK_LIST *block_list) { | |
| 162 BLOCK_IT block_it(block_list); | |
| 163 STATS xheights(0, block_it.data()->pdblk.bounding_box().height() - 1); | |
| 164 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 165 ROW_IT row_it(block_it.data()->row_list()); | |
| 166 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 167 xheights.add(IntCastRounded(row_it.data()->x_height()), 1); | |
| 168 } | |
| 169 } | |
| 170 return xheights.median(); | |
| 171 } | |
| 172 | |
| 173 /// Any row xheight that is significantly different from the median is set | |
| 174 /// to the median. | |
| 175 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) { | |
| 176 const double median_xheight = MedianXHeight(block_list); | |
| 177 const double max_deviation = kMaxXHeightDeviationFraction * median_xheight; | |
| 178 // Strip all fuzzy space markers to simplify the PAGE_RES. | |
| 179 BLOCK_IT b_it(block_list); | |
| 180 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { | |
| 181 BLOCK *block = b_it.data(); | |
| 182 ROW_IT r_it(block->row_list()); | |
| 183 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { | |
| 184 ROW *row = r_it.data(); | |
| 185 const double diff = fabs(row->x_height() - median_xheight); | |
| 186 if (diff > max_deviation) { | |
| 187 if (applybox_debug) { | |
| 188 tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight); | |
| 189 } | |
| 190 row->set_x_height(static_cast<float>(median_xheight)); | |
| 191 } | |
| 192 } | |
| 193 } | |
| 194 } | |
| 195 | |
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
/// All fuzzy spaces are removed, and all the words are maximally chopped.
PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {
  // Normalize outlier row x-heights to the median first.
  PreenXHeights(block_list);
  // Strip all fuzzy space markers to simplify the PAGE_RES.
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOCK *block = b_it.data();
    ROW_IT r_it(block->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
      ROW *row = r_it.data();
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD *word = w_it.data();
        if (word->cblob_list()->empty()) {
          // Words with no blobs are useless for training: unlink from the
          // list and free them. extract() keeps the iterator valid.
          delete w_it.extract();
        } else {
          word->set_flag(W_FUZZY_SP, false);
          word->set_flag(W_FUZZY_NON, false);
        }
      }
    }
  }
  // Build the PAGE_RES over the cleaned block list, then chop every word
  // as much as the chopper allows, so boxes can claim individual pieces.
  auto *page_res = new PAGE_RES(false, block_list, nullptr);
  PAGE_RES_IT pr_it(page_res);
  WERD_RES *word_res;
  while ((word_res = pr_it.word()) != nullptr) {
    MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
    pr_it.forward();
  }
  return page_res;
}
| 228 | |
/// Tests the chopper by exhaustively running chop_one_blob.
/// The word_res will contain filled chopped_word, seam_array, denorm,
/// box_word and best_state for the maximally chopped word.
void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,
                                  WERD_RES *word_res) {
  // If recognition setup fails, fall back to the unchopped word as-is.
  if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
                                     classify_bln_numeric_mode, textord_use_cjk_fp_model,
                                     poly_allow_detailed_fx, row, block)) {
    word_res->CloneChoppedToRebuild();
    return;
  }
  if (chop_debug) {
    tprintf("Maximally chopping word at:");
    word_res->word->bounding_box().print();
  }
  std::vector<BLOB_CHOICE *> blob_choices;
  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
  auto rating = static_cast<float>(INT8_MAX);
  for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
    // The rating and certainty are not quite arbitrary. Since
    // select_blob_to_chop uses the worst certainty to choose, they all have
    // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
    // in here, and then divide by e each time they are chopped, which
    // should guarantee a set of unequal values for the whole tree of blobs
    // produced, however much chopping is required. The chops are thus only
    // limited by the ability of the chopper to find suitable chop points,
    // and not by the value of the certainties.
    auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
    blob_choices.push_back(choice);
    rating -= 0.125f;
  }
  const double e = exp(1.0); // The base of natural logs.
  unsigned blob_number;
  if (!assume_fixed_pitch_char_segment) {
    // We only chop if the language is not fixed pitch like CJK.
    SEAM *seam = nullptr;
    int right_chop_index = 0;
    // Keep chopping until the chopper can find no more chop points.
    while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
      word_res->InsertSeam(blob_number, seam);
      // The chopped blob's left piece inherits a reduced rating; the right
      // piece gets a fresh fake choice so all ratings stay distinct.
      BLOB_CHOICE *left_choice = blob_choices[blob_number];
      rating = left_choice->rating() / e;
      left_choice->set_rating(rating);
      left_choice->set_certainty(-rating);
      // combine confidence w/ serial #
      auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
                                           0.0f, 0.0f, BCC_FAKE);
      blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
    }
  }
  word_res->CloneChoppedToRebuild();
  // Ownership of the BLOB_CHOICEs passes to the word's fake classification.
  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}
| 281 | |
| 282 /// Helper to compute the dispute resolution metric. | |
| 283 /// Disputed blob resolution. The aim is to give the blob to the most | |
| 284 /// appropriate boxfile box. Most of the time it is obvious, but if | |
| 285 /// two boxfile boxes overlap significantly it is not. If a small boxfile | |
| 286 /// box takes most of the blob, and a large boxfile box does too, then | |
| 287 /// we want the small boxfile box to get it, but if the small box | |
| 288 /// is much smaller than the blob, we don't want it to get it. | |
| 289 /// Details of the disputed blob resolution: | |
| 290 /// Given a box with area A, and a blob with area B, with overlap area C, | |
| 291 /// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum | |
| 292 /// miss metric gets the blob. | |
| 293 static double BoxMissMetric(const TBOX &box1, const TBOX &box2) { | |
| 294 const int overlap_area = box1.intersection(box2).area(); | |
| 295 const int a = box1.area(); | |
| 296 const int b = box2.area(); | |
| 297 ASSERT_HOST(a != 0 && b != 0); | |
| 298 return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b; | |
| 299 } | |
| 300 | |
/// Gather consecutive blobs that match the given box into the best_state
/// and corresponding correct_text.
///
/// Fights over which box owns which blobs are settled by pre-chopping and
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an appropriate blob for a box.
///
/// This means that occasionally, blobs may be incorrectly segmented if the
/// chopper fails to find a suitable chop point.
bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,
                                 const TBOX *next_box, const char *correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
  }
  PAGE_RES_IT page_res_it(page_res);
  WERD_RES *word_res;
  for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {
    // Only words that substantially overlap the box are candidates.
    if (!word_res->box_word->bounding_box().major_overlap(box)) {
      continue;
    }
    if (applybox_debug > 1) {
      tprintf("Checking word box:");
      word_res->box_word->bounding_box().print();
    }
    int word_len = word_res->box_word->length();
    for (int i = 0; i < word_len; ++i) {
      // Grow char_box over the run of consecutive blobs starting at i that
      // this box can claim.
      TBOX char_box = TBOX();
      int blob_count = 0;
      for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
        TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
        if (!blob_box.major_overlap(box)) {
          break;
        }
        if (word_res->correct_text[i + blob_count].length() > 0) {
          break; // Blob is claimed already.
        }
        if (next_box != nullptr) {
          // Dispute resolution: let the box with the smaller miss metric
          // keep the blob (see BoxMissMetric).
          const double current_box_miss_metric = BoxMissMetric(blob_box, box);
          const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
          if (applybox_debug > 2) {
            tprintf("Checking blob:");
            blob_box.print();
            tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
                    next_box_miss_metric);
          }
          if (current_box_miss_metric > next_box_miss_metric) {
            break; // Blob is a better match for next box.
          }
        }
        char_box += blob_box;
      }
      if (blob_count > 0) {
        if (applybox_debug > 1) {
          tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
        }
        // Reject if the claimed blobs do not roughly fill the box AND the
        // box overlaps a neighbour by more than 3 pixels - the blobs likely
        // belong to the neighbour.
        if (!char_box.almost_equal(box, 3) &&
            ((next_box != nullptr && box.x_gap(*next_box) < -3) ||
             (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
          return false;
        }
        // We refine just the box_word, best_state and correct_text here.
        // The rebuild_word is made in TidyUp.
        // blob_count blobs are put together to match the box. Merge the
        // box_word boxes, save the blob_count in the state and the text.
        word_res->box_word->MergeBoxes(i, i + blob_count);
        word_res->best_state[i] = blob_count;
        word_res->correct_text[i] = correct_text;
        if (applybox_debug > 2) {
          tprintf("%d Blobs match: blob box:", blob_count);
          word_res->box_word->BlobBox(i).print();
          tprintf("Matches box:");
          box.print();
          if (next_box != nullptr) {
            tprintf("With next box:");
            next_box->print();
          }
        }
        // Eliminate best_state and correct_text entries for the consumed
        // blobs.
        for (int j = 1; j < blob_count; ++j) {
          word_res->best_state.erase(word_res->best_state.begin() + i + 1);
          word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
        }
        // Assume that no box spans multiple source words, so we are done with
        // this box.
        if (applybox_debug > 1) {
          tprintf("Best state = ");
          for (auto best_state : word_res->best_state) {
            tprintf("%d ", best_state);
          }
          tprintf("\n");
          tprintf("Correct text = [[ ");
          for (auto &it : word_res->correct_text) {
            tprintf("%s ", it.c_str());
          }
          tprintf("]]\n");
        }
        return true;
      }
    }
  }
  if (applybox_debug > 0) {
    tprintf("FAIL!\n");
  }
  return false; // Failure.
}
| 408 | |
/// Consume all source blobs that strongly overlap the given box,
/// putting them into a new word, with the correct_text label.
/// Fights over which box owns which blobs are settled by
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,
                                 const char *correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  }
  // Lazily created on the first claimed blob; also doubles as the result
  // flag - nullptr on return means no blob matched the box.
  WERD *new_word = nullptr;
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOCK *block = b_it.data();
    if (!box.major_overlap(block->pdblk.bounding_box())) {
      continue;
    }
    ROW_IT r_it(block->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
      ROW *row = r_it.data();
      if (!box.major_overlap(row->bounding_box())) {
        continue;
      }
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD *word = w_it.data();
        if (applybox_debug > 2) {
          tprintf("Checking word:");
          word->bounding_box().print();
        }
        if (word->text() != nullptr && word->text()[0] != '\0') {
          continue; // Ignore words that are already done.
        }
        if (!box.major_overlap(word->bounding_box())) {
          continue;
        }
        C_BLOB_IT blob_it(word->cblob_list());
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
          C_BLOB *blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (!blob_box.major_overlap(box)) {
            continue;
          }
          if (next_box != nullptr) {
            // Dispute resolution: give the blob to whichever of box and
            // next_box has the smaller miss metric (see BoxMissMetric).
            const double current_box_miss_metric = BoxMissMetric(blob_box, box);
            const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
            if (applybox_debug > 2) {
              tprintf("Checking blob:");
              blob_box.print();
              tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
                      next_box_miss_metric);
            }
            if (current_box_miss_metric > next_box_miss_metric) {
              continue; // Blob is a better match for next box.
            }
          }
          if (applybox_debug > 2) {
            tprintf("Blob match: blob:");
            blob_box.print();
            tprintf("Matches box:");
            box.print();
            if (next_box != nullptr) {
              tprintf("With next box:");
              next_box->print();
            }
          }
          if (new_word == nullptr) {
            // Make a new word with a single blob.
            new_word = word->shallow_copy();
            new_word->set_text(correct_text);
            w_it.add_to_end(new_word);
          }
          // Move (not copy) the blob from its source word to the new word;
          // extract() keeps blob_it valid.
          C_BLOB_IT new_blob_it(new_word->cblob_list());
          new_blob_it.add_to_end(blob_it.extract());
        }
      }
    }
  }
  if (new_word == nullptr && applybox_debug > 0) {
    tprintf("FAIL!\n");
  }
  return new_word != nullptr;
}
| 493 | |
/// Resegments the words by running the classifier in an attempt to find the
/// correct segmentation that produces the required string.
/// Words whose truth text cannot be mapped to unichar ids, or for which no
/// matching segmentation is found, are deleted from the page.
void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
  PAGE_RES_IT pr_it(page_res);
  WERD_RES *word_res;
  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
    const WERD *word = word_res->word;
    if (word->text() == nullptr || word->text()[0] == '\0') {
      continue; // Ignore words that have no text.
    }
    // Convert the correct text to a vector of UNICHAR_ID
    std::vector<UNICHAR_ID> target_text;
    if (!ConvertStringToUnichars(word->text(), &target_text)) {
      tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
      // DeleteCurrentWord leaves pr_it positioned so forward() moves on.
      pr_it.DeleteCurrentWord();
      continue;
    }
    if (!FindSegmentation(target_text, word_res)) {
      tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
      pr_it.DeleteCurrentWord();
      continue;
    }
  }
}
| 518 | |
| 519 /// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. | |
| 520 /// @return false if an invalid UNICHAR_ID is encountered. | |
| 521 bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) { | |
| 522 for (int step = 0; *utf8 != '\0'; utf8 += step) { | |
| 523 const char *next_space = strchr(utf8, ' '); | |
| 524 if (next_space == nullptr) { | |
| 525 next_space = utf8 + strlen(utf8); | |
| 526 } | |
| 527 step = next_space - utf8; | |
| 528 UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step); | |
| 529 if (class_id == INVALID_UNICHAR_ID) { | |
| 530 return false; | |
| 531 } | |
| 532 while (utf8[step] == ' ') { | |
| 533 ++step; | |
| 534 } | |
| 535 class_ids->push_back(class_id); | |
| 536 } | |
| 537 return true; | |
| 538 } | |
| 539 | |
/// Resegments the word to achieve the target_text from the classifier.
/// Returns false if the re-segmentation fails.
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
/// applies a full search on the classifier results to find the best classified
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
/// substitutions ARE used.
bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
  // Classify all required combinations of blobs and save results in choices.
  // choices[i][j-1] holds the classification of the j consecutive blobs
  // starting at blob i, for j = 1..kMaxGroupSize.
  const int word_length = word_res->box_word->length();
  auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
  for (int i = 0; i < word_length; ++i) {
    for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
      BLOB_CHOICE_LIST *match_result =
          classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,
                         word_res->blamer_bundle);
      if (applybox_debug > 2) {
        tprintf("%d+%d:", i, j);
        print_ratings_list("Segment:", match_result, unicharset);
      }
      choices[i].push_back(match_result);
    }
  }
  // Search the segmentation graph for the target text. Must be an exact
  // match. Using wildcards makes it difficult to find the correct
  // segmentation even when it is there.
  word_res->best_state.clear();
  std::vector<int> search_segmentation;
  float best_rating = 0.0f;
  SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
                &word_res->best_state);
  // Free all classification results regardless of search outcome.
  for (int i = 0; i < word_length; ++i) {
    for (auto choice : choices[i]) {
      delete choice;
    }
  }
  delete[] choices;
  if (word_res->best_state.empty()) {
    // Build the original segmentation and if it is the same length as the
    // truth, assume it will do.
    // Count blobs between real (split-bearing) seams: a seam with no splits
    // marks an original blob boundary.
    int blob_count = 1;
    for (auto s : word_res->seam_array) {
      SEAM *seam = s;
      if (!seam->HasAnySplits()) {
        word_res->best_state.push_back(blob_count);
        blob_count = 1;
      } else {
        ++blob_count;
      }
    }
    word_res->best_state.push_back(blob_count);
    if (word_res->best_state.size() != target_text.size()) {
      word_res->best_state.clear(); // No good. Original segmentation bad size.
      return false;
    }
  }
  // Record the truth text, one entry per segment in best_state.
  word_res->correct_text.clear();
  for (auto &text : target_text) {
    word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));
  }
  return true;
}
| 601 | |
| 602 /// Recursive helper to find a match to the target_text (from text_index | |
| 603 /// position) in the choices (from choices_pos position). | |
| 604 /// @param choices is an array of vectors of length choices_length, | |
| 605 /// with each element representing a starting position in the word, and the | |
| 606 /// #vector holding classification results for a sequence of consecutive | |
| 607 /// blobs, with index 0 being a single blob, index 1 being 2 blobs etc. | |
| 608 /// @param choices_pos | |
| 609 /// @param choices_length | |
| 610 /// @param target_text | |
| 611 /// @param text_index | |
| 612 /// @param rating | |
| 613 /// @param segmentation | |
| 614 /// @param best_rating | |
| 615 /// @param best_segmentation | |
void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
                              unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,
                              unsigned text_index, float rating, std::vector<int> *segmentation,
                              float *best_rating, std::vector<int> *best_segmentation) {
  // Table of dangerous ambiguities, used for 1-1 substitutions below.
  const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
  // Try every group length available at this position.
  for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
    // Rating of matching choice or worst choice if no match.
    float choice_rating = 0.0f;
    // Find the corresponding best BLOB_CHOICE.
    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
      const BLOB_CHOICE *choice = choice_it.data();
      choice_rating = choice->rating();
      auto class_id = choice->unichar_id();
      if (class_id == target_text[text_index]) {
        break;
      }
      // Search ambigs table.
      if (static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) {
        AmbigSpec_IT spec_it(table[class_id]);
        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
          const AmbigSpec *ambig_spec = spec_it.data();
          // We'll only do 1-1.
          if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
              ambig_spec->correct_ngram_id == target_text[text_index]) {
            break;
          }
        }
        if (!spec_it.cycled_list()) {
          break; // Found an ambig.
        }
      }
    }
    // A completed (cycled) choice iterator means neither a direct match nor
    // an ambiguity matched the target unichar for this group length.
    if (choice_it.cycled_list()) {
      continue; // No match.
    }
    segmentation->push_back(length);
    if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
      // This is a complete match. If the rating is good record a new best.
      if (applybox_debug > 2) {
        tesserr << "Complete match, rating = " << rating + choice_rating
                << ", best=" << *best_rating
                << ", seglength=" << segmentation->size()
                << ", best=" << best_segmentation->size() << '\n';
      }
      if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
        *best_segmentation = *segmentation;
        *best_rating = rating + choice_rating;
      }
    } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
      // Partial match: recurse on the remaining blobs and remaining text.
      if (applybox_debug > 3) {
        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]),
                choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",
                choices_pos, length);
      }
      SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
                    rating + choice_rating, segmentation, best_rating, best_segmentation);
      if (applybox_debug > 3) {
        tprintf("End recursion for %d=%s\n", target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]));
      }
    }
    // Backtrack: undo the push for this group length before trying the next.
    segmentation->resize(segmentation->size() - 1);
  }
}
| 682 | |
| 683 /// - Counts up the labelled words and the blobs within. | |
| 684 /// - Deletes all unused or emptied words, counting the unused ones. | |
| 685 /// - Resets W_BOL and W_EOL flags correctly. | |
| 686 /// - Builds the rebuild_word and rebuilds the box_word and the best_choice. | |
| 687 void Tesseract::TidyUp(PAGE_RES *page_res) { | |
| 688 int ok_blob_count = 0; | |
| 689 int bad_blob_count = 0; | |
| 690 // TODO: check usage of ok_word_count. | |
| 691 int ok_word_count = 0; | |
| 692 int unlabelled_words = 0; | |
| 693 PAGE_RES_IT pr_it(page_res); | |
| 694 WERD_RES *word_res; | |
| 695 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) { | |
| 696 int ok_in_word = 0; | |
| 697 int blob_count = word_res->correct_text.size(); | |
| 698 auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count); | |
| 699 word_choice->set_permuter(TOP_CHOICE_PERM); | |
| 700 for (int c = 0; c < blob_count; ++c) { | |
| 701 if (word_res->correct_text[c].length() > 0) { | |
| 702 ++ok_in_word; | |
| 703 } | |
| 704 // Since we only need a fake word_res->best_choice, the actual | |
| 705 // unichar_ids do not matter. Which is fortunate, since TidyUp() | |
| 706 // can be called while training Tesseract, at the stage where | |
| 707 // unicharset is not meaningful yet. | |
| 708 word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c], | |
| 709 1.0f, -1.0f); | |
| 710 } | |
| 711 if (ok_in_word > 0) { | |
| 712 ok_blob_count += ok_in_word; | |
| 713 bad_blob_count += word_res->correct_text.size() - ok_in_word; | |
| 714 word_res->LogNewRawChoice(word_choice); | |
| 715 word_res->LogNewCookedChoice(1, false, word_choice); | |
| 716 } else { | |
| 717 ++unlabelled_words; | |
| 718 if (applybox_debug > 0) { | |
| 719 tprintf("APPLY_BOXES: Unlabelled word at :"); | |
| 720 word_res->word->bounding_box().print(); | |
| 721 } | |
| 722 pr_it.DeleteCurrentWord(); | |
| 723 delete word_choice; | |
| 724 } | |
| 725 } | |
| 726 pr_it.restart_page(); | |
| 727 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) { | |
| 728 // Denormalize back to a BoxWord. | |
| 729 word_res->RebuildBestState(); | |
| 730 word_res->SetupBoxWord(); | |
| 731 word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row()); | |
| 732 word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row()); | |
| 733 } | |
| 734 if (applybox_debug > 0) { | |
| 735 tprintf(" Found %d good blobs.\n", ok_blob_count); | |
| 736 if (bad_blob_count > 0) { | |
| 737 tprintf(" Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count); | |
| 738 } | |
| 739 if (unlabelled_words > 0) { | |
| 740 tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words); | |
| 741 } | |
| 742 } | |
| 743 } | |
| 744 | |
| 745 /** Logs a bad box by line in the box file and box coords.*/ | |
| 746 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, | |
| 747 const char *err_msg) { | |
| 748 tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", boxfile_lineno + 1, box_ch, | |
| 749 box.left(), box.bottom(), box.right(), box.top(), err_msg); | |
| 750 } | |
| 751 | |
| 752 /// Calls #LearnWord to extract features for labelled blobs within each word. | |
| 753 /// Features are stored in an internal buffer. | |
| 754 void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) { | |
| 755 PAGE_RES_IT pr_it(page_res); | |
| 756 int word_count = 0; | |
| 757 for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) { | |
| 758 LearnWord(fontname.c_str(), word_res); | |
| 759 ++word_count; | |
| 760 } | |
| 761 tprintf("Generated training data for %d words\n", word_count); | |
| 762 } | |
| 763 | |
| 764 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 765 | |
| 766 /** Creates a fake best_choice entry in each WERD_RES with the correct text.*/ | |
| 767 void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) { | |
| 768 PAGE_RES_IT pr_it(page_res); | |
| 769 for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) { | |
| 770 auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size()); | |
| 771 for (auto &correct_text : word_res->correct_text) { | |
| 772 // The part before the first space is the real ground truth, and the | |
| 773 // rest is the bounding box location and page number. | |
| 774 std::vector<std::string> tokens = split(correct_text, ' '); | |
| 775 UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str()); | |
| 776 choice->append_unichar_id_space_allocated(char_id, word_res->best_state[&correct_text - &word_res->correct_text[0]], 0.0f, 0.0f); | |
| 777 } | |
| 778 word_res->ClearWordChoices(); | |
| 779 word_res->LogNewRawChoice(choice); | |
| 780 word_res->LogNewCookedChoice(1, false, choice); | |
| 781 } | |
| 782 } | |
| 783 | |
| 784 } // namespace tesseract |
