comparison mupdf-source/thirdparty/tesseract/src/ccmain/applybox.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: applybox.cpp (Formerly applybox.c)
3 * Description: Re segment rows according to box file data
4 * Author: Phil Cheatle
5 *
6 * (C) Copyright 1993, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #ifndef DISABLED_LEGACY_ENGINE
20 # include <allheaders.h>
21 # include <cctype>
22 # include <cerrno>
23 # include <cstring>
24 # include "boxread.h"
25 #endif // ndef DISABLED_LEGACY_ENGINE
26 #include <tesseract/unichar.h>
27 #include "pageres.h"
28 #include "tesseractclass.h"
29 #include "tesserrstream.h" // for tesserr
30 #include "unicharset.h"
31
#ifndef DISABLED_LEGACY_ENGINE
/** Max number of blobs to classify together in FindSegmentation. */
const int kMaxGroupSize = 4;
/// Max fraction of median allowed as deviation in xheight before switching
/// to median. Used by PreenXHeights.
const double kMaxXHeightDeviationFraction = 0.125;
#endif // ndef DISABLED_LEGACY_ENGINE
39
40 /**
41 * The box file is assumed to contain box definitions, one per line, of the
42 * following format for blob-level boxes:
43 * @verbatim
44 * <UTF8 str> <left> <bottom> <right> <top> <page id>
45 * @endverbatim
46 * and for word/line-level boxes:
47 * @verbatim
48 * WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
49 * @endverbatim
50 * NOTES:
51 * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
52 *
53 * <page id> is 0-based, and the page number is used for multipage input (tiff).
54 *
55 * In the blob-level form, each line represents a recognizable unit, which may
56 * be several UTF-8 bytes, but there is a bounding box around each recognizable
57 * unit, and no classifier is needed to train in this mode (bootstrapping.)
58 *
59 * In the word/line-level form, the line begins with the literal "WordStr", and
60 * the bounding box bounds either a whole line or a whole word. The recognizable
61 * units in the word/line are listed after the # at the end of the line and
62 * are space delimited, ignoring any original spaces on the line.
63 * Eg.
64 * @verbatim
65 * word -> #w o r d
66 * multi word line -> #m u l t i w o r d l i n e
67 * @endverbatim
68 * The recognizable units must be space-delimited in order to allow multiple
69 * unicodes to be used for a single recognizable unit, eg Hindi.
70 *
71 * In this mode, the classifier must have been pre-trained with the desired
72 * character set, or it will not be able to find the character segmentations.
73 */
74
75 namespace tesseract {
76
77 #ifndef DISABLED_LEGACY_ENGINE
78 static void clear_any_old_text(BLOCK_LIST *block_list) {
79 BLOCK_IT block_it(block_list);
80 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
81 ROW_IT row_it(block_it.data()->row_list());
82 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
83 WERD_IT word_it(row_it.data()->word_list());
84 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
85 word_it.data()->set_text("");
86 }
87 }
88 }
89 }
90
91 // Applies the box file based on the image name filename, and resegments
92 // the words in the block_list (page), with:
93 // blob-mode: one blob per line in the box file, words as input.
94 // word/line-mode: one blob per space-delimited unit after the #, and one word
95 // per line in the box file. (See comment above for box file format.)
96 // If find_segmentation is true, (word/line mode) then the classifier is used
97 // to re-segment words/lines to match the space-delimited truth string for
98 // each box. In this case, the input box may be for a word or even a whole
99 // text line, and the output words will contain multiple blobs corresponding
100 // to the space-delimited input string.
101 // With find_segmentation false, no classifier is needed, but the chopper
102 // can still be used to correctly segment touching characters with the help
103 // of the input boxes.
104 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
105 // from normal classification, ie. with a word, chopped_word, rebuild_word,
106 // seam_array, denorm, box_word, and best_state, but NO best_choice or
107 // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
108 // Instead, the correct_text member of WERD_RES is set, and this may be later
109 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
110 // is not required before calling ApplyBoxTraining.
PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,
                                BLOCK_LIST *block_list) {
  std::vector<TBOX> boxes;
  std::vector<std::string> texts, full_texts;
  // Load all boxes for page applybox_page from the box file belonging to
  // filename: texts holds the per-box strings, full_texts the full lines.
  if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {
    return nullptr; // Can't do it.
  }

  const int box_count = boxes.size();
  int box_failures = 0;

  // In word mode, we use the boxes to make a word for each box, but
  // in blob mode we use the existing words and maximally chop them first.
  PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list);
  clear_any_old_text(block_list);

  for (int i = 0; i < box_count; i++) {
    bool foundit = false;
    if (page_res != nullptr) {
      // Blob mode: match box i against chopped blobs, passing the previous
      // and next boxes so disputed blobs can be resolved between neighbours.
      foundit =
          ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i],
                           (i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str());
    } else {
      // Word/line mode: gather whole blobs into a new word for box i.
      foundit = ResegmentWordBox(block_list, boxes[i],
                                 (i == box_count - 1) ? nullptr : &boxes[i + 1], texts[i].c_str());
    }
    if (!foundit) {
      box_failures++;
      ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
    }
  }

  if (page_res == nullptr) {
    // In word/line mode, we now maximally chop all the words and resegment
    // them with the classifier.
    page_res = SetupApplyBoxes(boxes, block_list);
    ReSegmentByClassification(page_res);
  }
  if (applybox_debug > 0) {
    tprintf("APPLY_BOXES:\n");
    tprintf(" Boxes read from boxfile: %6d\n", box_count);
    if (box_failures > 0) {
      tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
    }
  }
  // Delete unlabelled words and rebuild box_word/best_choice for the rest.
  TidyUp(page_res);
  return page_res;
}
159
// Helper computes median xheight in the image.
static double MedianXHeight(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  // NOTE(review): the histogram range is derived from the *first* block's
  // bounding box height only; assumes that bounds all row x-heights seen
  // below -- TODO confirm for multi-block pages with differing block sizes.
  STATS xheights(0, block_it.data()->pdblk.bounding_box().height() - 1);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      // One sample per row, rounded to the nearest integer bucket.
      xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
    }
  }
  return xheights.median();
}
172
/// Any row xheight that is significantly different from the median is set
/// to the median.
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
  const double median_xheight = MedianXHeight(block_list);
  // Rows whose xheight deviates from the median by more than this fraction
  // are treated as outliers.
  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
  // Clamp the xheight of every outlier row to the median.
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOCK *block = b_it.data();
    ROW_IT r_it(block->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
      ROW *row = r_it.data();
      const double diff = fabs(row->x_height() - median_xheight);
      if (diff > max_deviation) {
        if (applybox_debug) {
          tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
        }
        row->set_x_height(static_cast<float>(median_xheight));
      }
    }
  }
}
195
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
/// All fuzzy spaces are removed, and all the words are maximally chopped.
PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {
  PreenXHeights(block_list);
  // Strip all fuzzy space markers to simplify the PAGE_RES.
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOCK *block = b_it.data();
    ROW_IT r_it(block->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
      ROW *row = r_it.data();
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD *word = w_it.data();
        if (word->cblob_list()->empty()) {
          // Words with no blobs can never match a box; drop them now.
          delete w_it.extract();
        } else {
          word->set_flag(W_FUZZY_SP, false);
          word->set_flag(W_FUZZY_NON, false);
        }
      }
    }
  }
  auto *page_res = new PAGE_RES(false, block_list, nullptr);
  PAGE_RES_IT pr_it(page_res);
  WERD_RES *word_res;
  // Chop every word as finely as the chopper allows so that boxes can later
  // be matched against minimal blobs.
  while ((word_res = pr_it.word()) != nullptr) {
    MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
    pr_it.forward();
  }
  return page_res;
}
228
/// Tests the chopper by exhaustively running chop_one_blob.
/// The word_res will contain filled chopped_word, seam_array, denorm,
/// box_word and best_state for the maximally chopped word.
void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,
                                  WERD_RES *word_res) {
  if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
                                     classify_bln_numeric_mode, textord_use_cjk_fp_model,
                                     poly_allow_detailed_fx, row, block)) {
    // Setup failed: keep the unchopped word as the rebuild result.
    word_res->CloneChoppedToRebuild();
    return;
  }
  if (chop_debug) {
    tprintf("Maximally chopping word at:");
    word_res->word->bounding_box().print();
  }
  std::vector<BLOB_CHOICE *> blob_choices;
  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
  auto rating = static_cast<float>(INT8_MAX);
  for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
    // The rating and certainty are not quite arbitrary. Since
    // select_blob_to_chop uses the worst certainty to choose, they all have
    // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
    // in here, and then divide by e each time they are chopped, which
    // should guarantee a set of unequal values for the whole tree of blobs
    // produced, however much chopping is required. The chops are thus only
    // limited by the ability of the chopper to find suitable chop points,
    // and not by the value of the certainties.
    auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
    blob_choices.push_back(choice);
    rating -= 0.125f;
  }
  const double e = exp(1.0); // The base of natural logs.
  unsigned blob_number;
  if (!assume_fixed_pitch_char_segment) {
    // We only chop if the language is not fixed pitch like CJK.
    SEAM *seam = nullptr;
    int right_chop_index = 0;
    while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
      word_res->InsertSeam(blob_number, seam);
      // The chopped blob keeps a rating divided by e; see the scheme above.
      BLOB_CHOICE *left_choice = blob_choices[blob_number];
      rating = left_choice->rating() / e;
      left_choice->set_rating(rating);
      left_choice->set_certainty(-rating);
      // combine confidence w/ serial #
      auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
                                           0.0f, 0.0f, BCC_FAKE);
      blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
    }
  }
  word_res->CloneChoppedToRebuild();
  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}
281
282 /// Helper to compute the dispute resolution metric.
283 /// Disputed blob resolution. The aim is to give the blob to the most
284 /// appropriate boxfile box. Most of the time it is obvious, but if
285 /// two boxfile boxes overlap significantly it is not. If a small boxfile
286 /// box takes most of the blob, and a large boxfile box does too, then
287 /// we want the small boxfile box to get it, but if the small box
288 /// is much smaller than the blob, we don't want it to get it.
289 /// Details of the disputed blob resolution:
290 /// Given a box with area A, and a blob with area B, with overlap area C,
291 /// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
292 /// miss metric gets the blob.
293 static double BoxMissMetric(const TBOX &box1, const TBOX &box2) {
294 const int overlap_area = box1.intersection(box2).area();
295 const int a = box1.area();
296 const int b = box2.area();
297 ASSERT_HOST(a != 0 && b != 0);
298 return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
299 }
300
/// Gather consecutive blobs that match the given box into the best_state
/// and corresponding correct_text.
///
/// Fights over which box owns which blobs are settled by pre-chopping and
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an appropriate blob for a box.
///
/// This means that occasionally, blobs may be incorrectly segmented if the
/// chopper fails to find a suitable chop point.
bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,
                                 const TBOX *next_box, const char *correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
  }
  PAGE_RES_IT page_res_it(page_res);
  WERD_RES *word_res;
  for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {
    // Only words whose bounding box mostly overlaps the target box can own it.
    if (!word_res->box_word->bounding_box().major_overlap(box)) {
      continue;
    }
    if (applybox_debug > 1) {
      tprintf("Checking word box:");
      word_res->box_word->bounding_box().print();
    }
    int word_len = word_res->box_word->length();
    for (int i = 0; i < word_len; ++i) {
      TBOX char_box = TBOX();
      int blob_count = 0;
      // Greedily absorb consecutive unclaimed blobs that mostly overlap box.
      for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
        TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
        if (!blob_box.major_overlap(box)) {
          break;
        }
        if (word_res->correct_text[i + blob_count].length() > 0) {
          break; // Blob is claimed already.
        }
        if (next_box != nullptr) {
          // In a dispute with the next box, the blob goes to whichever box
          // it misses least.
          const double current_box_miss_metric = BoxMissMetric(blob_box, box);
          const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
          if (applybox_debug > 2) {
            tprintf("Checking blob:");
            blob_box.print();
            tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
                    next_box_miss_metric);
          }
          if (current_box_miss_metric > next_box_miss_metric) {
            break; // Blob is a better match for next box.
          }
        }
        char_box += blob_box;
      }
      if (blob_count > 0) {
        if (applybox_debug > 1) {
          tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
        }
        // Reject the match when the assembled blobs do not closely match the
        // box, yet the box overlaps a neighbour by more than 3 pixels.
        if (!char_box.almost_equal(box, 3) &&
            ((next_box != nullptr && box.x_gap(*next_box) < -3) ||
             (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
          return false;
        }
        // We refine just the box_word, best_state and correct_text here.
        // The rebuild_word is made in TidyUp.
        // blob_count blobs are put together to match the box. Merge the
        // box_word boxes, save the blob_count in the state and the text.
        word_res->box_word->MergeBoxes(i, i + blob_count);
        word_res->best_state[i] = blob_count;
        word_res->correct_text[i] = correct_text;
        if (applybox_debug > 2) {
          tprintf("%d Blobs match: blob box:", blob_count);
          word_res->box_word->BlobBox(i).print();
          tprintf("Matches box:");
          box.print();
          if (next_box != nullptr) {
            tprintf("With next box:");
            next_box->print();
          }
        }
        // Eliminated best_state and correct_text entries for the consumed
        // blobs.
        for (int j = 1; j < blob_count; ++j) {
          word_res->best_state.erase(word_res->best_state.begin() + i + 1);
          word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
        }
        // Assume that no box spans multiple source words, so we are done with
        // this box.
        if (applybox_debug > 1) {
          tprintf("Best state = ");
          for (auto best_state : word_res->best_state) {
            tprintf("%d ", best_state);
          }
          tprintf("\n");
          tprintf("Correct text = [[ ");
          for (auto &it : word_res->correct_text) {
            tprintf("%s ", it.c_str());
          }
          tprintf("]]\n");
        }
        return true;
      }
    }
  }
  if (applybox_debug > 0) {
    tprintf("FAIL!\n");
  }
  return false; // Failure.
}
408
/// Consume all source blobs that strongly overlap the given box,
/// putting them into a new word, with the correct_text label.
/// Fights over which box owns which blobs are settled by
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,
                                 const char *correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  }
  WERD *new_word = nullptr;
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOCK *block = b_it.data();
    // Skip whole blocks/rows/words whose bounds don't mostly overlap the box.
    if (!box.major_overlap(block->pdblk.bounding_box())) {
      continue;
    }
    ROW_IT r_it(block->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
      ROW *row = r_it.data();
      if (!box.major_overlap(row->bounding_box())) {
        continue;
      }
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD *word = w_it.data();
        if (applybox_debug > 2) {
          tprintf("Checking word:");
          word->bounding_box().print();
        }
        if (word->text() != nullptr && word->text()[0] != '\0') {
          continue; // Ignore words that are already done.
        }
        if (!box.major_overlap(word->bounding_box())) {
          continue;
        }
        C_BLOB_IT blob_it(word->cblob_list());
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
          C_BLOB *blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (!blob_box.major_overlap(box)) {
            continue;
          }
          if (next_box != nullptr) {
            // In a dispute with the next box, the blob goes to whichever box
            // it misses least.
            const double current_box_miss_metric = BoxMissMetric(blob_box, box);
            const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
            if (applybox_debug > 2) {
              tprintf("Checking blob:");
              blob_box.print();
              tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
                      next_box_miss_metric);
            }
            if (current_box_miss_metric > next_box_miss_metric) {
              continue; // Blob is a better match for next box.
            }
          }
          if (applybox_debug > 2) {
            tprintf("Blob match: blob:");
            blob_box.print();
            tprintf("Matches box:");
            box.print();
            if (next_box != nullptr) {
              tprintf("With next box:");
              next_box->print();
            }
          }
          if (new_word == nullptr) {
            // Make a new word with a single blob.
            new_word = word->shallow_copy();
            new_word->set_text(correct_text);
            w_it.add_to_end(new_word);
          }
          // Move (not copy) the blob from the source word to the new word.
          C_BLOB_IT new_blob_it(new_word->cblob_list());
          new_blob_it.add_to_end(blob_it.extract());
        }
      }
    }
  }
  if (new_word == nullptr && applybox_debug > 0) {
    tprintf("FAIL!\n");
  }
  return new_word != nullptr;
}
493
494 /// Resegments the words by running the classifier in an attempt to find the
495 /// correct segmentation that produces the required string.
496 void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
497 PAGE_RES_IT pr_it(page_res);
498 WERD_RES *word_res;
499 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
500 const WERD *word = word_res->word;
501 if (word->text() == nullptr || word->text()[0] == '\0') {
502 continue; // Ignore words that have no text.
503 }
504 // Convert the correct text to a vector of UNICHAR_ID
505 std::vector<UNICHAR_ID> target_text;
506 if (!ConvertStringToUnichars(word->text(), &target_text)) {
507 tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
508 pr_it.DeleteCurrentWord();
509 continue;
510 }
511 if (!FindSegmentation(target_text, word_res)) {
512 tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
513 pr_it.DeleteCurrentWord();
514 continue;
515 }
516 }
517 }
518
519 /// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
520 /// @return false if an invalid UNICHAR_ID is encountered.
521 bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
522 for (int step = 0; *utf8 != '\0'; utf8 += step) {
523 const char *next_space = strchr(utf8, ' ');
524 if (next_space == nullptr) {
525 next_space = utf8 + strlen(utf8);
526 }
527 step = next_space - utf8;
528 UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
529 if (class_id == INVALID_UNICHAR_ID) {
530 return false;
531 }
532 while (utf8[step] == ' ') {
533 ++step;
534 }
535 class_ids->push_back(class_id);
536 }
537 return true;
538 }
539
/// Resegments the word to achieve the target_text from the classifier.
/// Returns false if the re-segmentation fails.
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
/// applies a full search on the classifier results to find the best classified
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
/// substitutions ARE used.
bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
  // Classify all required combinations of blobs and save results in choices.
  const int word_length = word_res->box_word->length();
  // choices[i][j-1] holds the classification of blobs [i, i+j).
  auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
  for (int i = 0; i < word_length; ++i) {
    for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
      BLOB_CHOICE_LIST *match_result =
          classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,
                         word_res->blamer_bundle);
      if (applybox_debug > 2) {
        tprintf("%d+%d:", i, j);
        print_ratings_list("Segment:", match_result, unicharset);
      }
      choices[i].push_back(match_result);
    }
  }
  // Search the segmentation graph for the target text. Must be an exact
  // match. Using wildcards makes it difficult to find the correct
  // segmentation even when it is there.
  word_res->best_state.clear();
  std::vector<int> search_segmentation;
  float best_rating = 0.0f;
  SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
                &word_res->best_state);
  // The classification results are no longer needed after the search.
  for (int i = 0; i < word_length; ++i) {
    for (auto choice : choices[i]) {
      delete choice;
    }
  }
  delete[] choices;
  if (word_res->best_state.empty()) {
    // Build the original segmentation and if it is the same length as the
    // truth, assume it will do.
    int blob_count = 1;
    for (auto s : word_res->seam_array) {
      SEAM *seam = s;
      if (!seam->HasAnySplits()) {
        word_res->best_state.push_back(blob_count);
        blob_count = 1;
      } else {
        ++blob_count;
      }
    }
    word_res->best_state.push_back(blob_count);
    if (word_res->best_state.size() != target_text.size()) {
      word_res->best_state.clear(); // No good. Original segmentation bad size.
      return false;
    }
  }
  // Record the matching truth text, one entry per resulting blob group.
  word_res->correct_text.clear();
  for (auto &text : target_text) {
    word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));
  }
  return true;
}
601
/// Recursive helper to find a match to the target_text (from text_index
/// position) in the choices (from choices_pos position).
/// @param choices is an array of vectors of length choices_length,
/// with each element representing a starting position in the word, and the
/// #vector holding classification results for a sequence of consecutive
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
/// @param choices_pos current starting blob position in the word
/// @param choices_length total number of blob positions
/// @param target_text truth text as a sequence of UNICHAR_IDs
/// @param text_index index of the next truth character to match
/// @param rating accumulated rating of the partial segmentation so far
/// @param segmentation working vector of blob-group lengths (backtracked)
/// @param best_rating in/out rating of the best complete match found
/// @param best_segmentation in/out best complete segmentation found
void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
                              unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,
                              unsigned text_index, float rating, std::vector<int> *segmentation,
                              float *best_rating, std::vector<int> *best_segmentation) {
  const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
  // Try every available group length at this position.
  for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
    // Rating of matching choice or worst choice if no match.
    float choice_rating = 0.0f;
    // Find the corresponding best BLOB_CHOICE.
    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
      const BLOB_CHOICE *choice = choice_it.data();
      choice_rating = choice->rating();
      auto class_id = choice->unichar_id();
      if (class_id == target_text[text_index]) {
        break;
      }
      // Search ambigs table.
      if (static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) {
        AmbigSpec_IT spec_it(table[class_id]);
        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
          const AmbigSpec *ambig_spec = spec_it.data();
          // We'll only do 1-1.
          if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
              ambig_spec->correct_ngram_id == target_text[text_index]) {
            break;
          }
        }
        if (!spec_it.cycled_list()) {
          break; // Found an ambig.
        }
      }
    }
    if (choice_it.cycled_list()) {
      continue; // No match.
    }
    segmentation->push_back(length);
    if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
      // This is a complete match. If the rating is good record a new best.
      if (applybox_debug > 2) {
        tesserr << "Complete match, rating = " << rating + choice_rating
                << ", best=" << *best_rating
                << ", seglength=" << segmentation->size()
                << ", best=" << best_segmentation->size() << '\n';
      }
      if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
        *best_segmentation = *segmentation;
        *best_rating = rating + choice_rating;
      }
    } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
      if (applybox_debug > 3) {
        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]),
                choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",
                choices_pos, length);
      }
      SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
                    rating + choice_rating, segmentation, best_rating, best_segmentation);
      if (applybox_debug > 3) {
        tprintf("End recursion for %d=%s\n", target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]));
      }
    }
    // Backtrack: undo the push_back before trying the next group length.
    segmentation->resize(segmentation->size() - 1);
  }
}
682
683 /// - Counts up the labelled words and the blobs within.
684 /// - Deletes all unused or emptied words, counting the unused ones.
685 /// - Resets W_BOL and W_EOL flags correctly.
686 /// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
687 void Tesseract::TidyUp(PAGE_RES *page_res) {
688 int ok_blob_count = 0;
689 int bad_blob_count = 0;
690 // TODO: check usage of ok_word_count.
691 int ok_word_count = 0;
692 int unlabelled_words = 0;
693 PAGE_RES_IT pr_it(page_res);
694 WERD_RES *word_res;
695 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
696 int ok_in_word = 0;
697 int blob_count = word_res->correct_text.size();
698 auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
699 word_choice->set_permuter(TOP_CHOICE_PERM);
700 for (int c = 0; c < blob_count; ++c) {
701 if (word_res->correct_text[c].length() > 0) {
702 ++ok_in_word;
703 }
704 // Since we only need a fake word_res->best_choice, the actual
705 // unichar_ids do not matter. Which is fortunate, since TidyUp()
706 // can be called while training Tesseract, at the stage where
707 // unicharset is not meaningful yet.
708 word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],
709 1.0f, -1.0f);
710 }
711 if (ok_in_word > 0) {
712 ok_blob_count += ok_in_word;
713 bad_blob_count += word_res->correct_text.size() - ok_in_word;
714 word_res->LogNewRawChoice(word_choice);
715 word_res->LogNewCookedChoice(1, false, word_choice);
716 } else {
717 ++unlabelled_words;
718 if (applybox_debug > 0) {
719 tprintf("APPLY_BOXES: Unlabelled word at :");
720 word_res->word->bounding_box().print();
721 }
722 pr_it.DeleteCurrentWord();
723 delete word_choice;
724 }
725 }
726 pr_it.restart_page();
727 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
728 // Denormalize back to a BoxWord.
729 word_res->RebuildBestState();
730 word_res->SetupBoxWord();
731 word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
732 word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
733 }
734 if (applybox_debug > 0) {
735 tprintf(" Found %d good blobs.\n", ok_blob_count);
736 if (bad_blob_count > 0) {
737 tprintf(" Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
738 }
739 if (unlabelled_words > 0) {
740 tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
741 }
742 }
743 }
744
745 /** Logs a bad box by line in the box file and box coords.*/
746 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
747 const char *err_msg) {
748 tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", boxfile_lineno + 1, box_ch,
749 box.left(), box.bottom(), box.right(), box.top(), err_msg);
750 }
751
752 /// Calls #LearnWord to extract features for labelled blobs within each word.
753 /// Features are stored in an internal buffer.
754 void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) {
755 PAGE_RES_IT pr_it(page_res);
756 int word_count = 0;
757 for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
758 LearnWord(fontname.c_str(), word_res);
759 ++word_count;
760 }
761 tprintf("Generated training data for %d words\n", word_count);
762 }
763
764 #endif // ndef DISABLED_LEGACY_ENGINE
765
766 /** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
767 void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) {
768 PAGE_RES_IT pr_it(page_res);
769 for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
770 auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());
771 for (auto &correct_text : word_res->correct_text) {
772 // The part before the first space is the real ground truth, and the
773 // rest is the bounding box location and page number.
774 std::vector<std::string> tokens = split(correct_text, ' ');
775 UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
776 choice->append_unichar_id_space_allocated(char_id, word_res->best_state[&correct_text - &word_res->correct_text[0]], 0.0f, 0.0f);
777 }
778 word_res->ClearWordChoices();
779 word_res->LogNewRawChoice(choice);
780 word_res->LogNewCookedChoice(1, false, choice);
781 }
782 }
783
784 } // namespace tesseract