comparison mupdf-source/thirdparty/tesseract/src/ccmain/resultiterator.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: resultiterator.cpp
3 // Description: Iterator for tesseract results that is capable of
4 // iterating in proper reading order over Bi Directional
5 // (e.g. mixed Hebrew and English) text.
6 // Author: David Eger
7 //
8 // (C) Copyright 2011, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
19 ///////////////////////////////////////////////////////////////////////
20
21 #include <tesseract/resultiterator.h>
22
23 #include "helpers.h" // for copy_string
24 #include "pageres.h"
25 #include "tesseractclass.h"
26 #include "unicharset.h"
27
28 #include <allheaders.h>
29
30 #include <set>
31 #include <vector>
32
33 static const char *const kLRM = "\u200E"; // Left-to-Right Mark
34 static const char *const kRLM = "\u200F"; // Right-to-Left Mark
35
36 namespace tesseract {
37
38 ResultIterator::ResultIterator(const LTRResultIterator &resit) : LTRResultIterator(resit) {
39 in_minor_direction_ = false;
40 at_beginning_of_minor_run_ = false;
41 preserve_interword_spaces_ = false;
42
43 auto *p = ParamUtils::FindParam<BoolParam>(
44 "preserve_interword_spaces", GlobalParams()->bool_params, tesseract_->params()->bool_params);
45 if (p != nullptr) {
46 preserve_interword_spaces_ = (bool)(*p);
47 }
48
49 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
50 MoveToLogicalStartOfTextline();
51 }
52
53 ResultIterator *ResultIterator::StartOfParagraph(const LTRResultIterator &resit) {
54 return new ResultIterator(resit);
55 }
56
57 bool ResultIterator::ParagraphIsLtr() const {
58 return current_paragraph_is_ltr_;
59 }
60
61 bool ResultIterator::CurrentParagraphIsLtr() const {
62 if (!it_->word()) {
63 return true; // doesn't matter.
64 }
65 LTRResultIterator it(*this);
66 it.RestartParagraph();
67 // Try to figure out the ltr-ness of the paragraph. The rules below
68 // make more sense in the context of a difficult paragraph example.
69 // Here we denote {ltr characters, RTL CHARACTERS}:
70 //
71 // "don't go in there!" DAIS EH
72 // EHT OTNI DEPMUJ FELSMIH NEHT DNA
73 // .GNIDLIUB GNINRUB
74 //
75 // On the first line, the left-most word is LTR and the rightmost word
76 // is RTL. Thus, we are better off taking the majority direction for
77 // the whole paragraph contents. So instead of "the leftmost word is LTR"
78 // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
79 // would not do: Typically an RTL paragraph would *not* start with an LTR
80 // word. So our heuristics are as follows:
81 //
82 // (1) If the first text line has an RTL word in the left-most position
83 // it is RTL.
84 // (2) If the first text line has an LTR word in the right-most position
85 // it is LTR.
86 // (3) If neither of the above is true, take the majority count for the
87 // paragraph -- if there are more rtl words, it is RTL. If there
88 // are more LTR words, it's LTR.
89 bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
90 bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
91 int num_ltr, num_rtl;
92 num_rtl = leftmost_rtl ? 1 : 0;
93 num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
94 for (it.Next(RIL_WORD); !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
95 it.Next(RIL_WORD)) {
96 StrongScriptDirection dir = it.WordDirection();
97 rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
98 num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
99 num_ltr += rightmost_ltr ? 1 : 0;
100 }
101 if (leftmost_rtl) {
102 return false;
103 }
104 if (rightmost_ltr) {
105 return true;
106 }
107 // First line is ambiguous. Take statistics on the whole paragraph.
108 if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) {
109 do {
110 StrongScriptDirection dir = it.WordDirection();
111 num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
112 num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
113 } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
114 }
115 return num_ltr >= num_rtl;
116 }
117
118 const int ResultIterator::kMinorRunStart = -1;
119 const int ResultIterator::kMinorRunEnd = -2;
120 const int ResultIterator::kComplexWord = -3;
121
122 void ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices) const {
123 bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
124 blob_indices->clear();
125 if (Empty(RIL_WORD)) {
126 return;
127 }
128 if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
129 // Easy! just return the blobs in order;
130 for (int i = 0; i < word_length_; i++) {
131 blob_indices->push_back(i);
132 }
133 return;
134 }
135
136 // The blobs are in left-to-right order, but the current reading context
137 // is right-to-left.
138 const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
139 const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
140 const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
141 const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
142 const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
143 const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
144 const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
145
146 // Step 1: Scan for and mark European Number sequences
147 // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
148 std::vector<int> letter_types;
149 letter_types.reserve(word_length_);
150 for (int i = 0; i < word_length_; i++) {
151 letter_types.push_back(it_->word()->SymbolDirection(i));
152 }
153 // Convert a single separator sandwiched between two ENs into an EN.
154 for (int i = 0; i + 2 < word_length_; i++) {
155 if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
156 (letter_types[i + 1] == U_EURO_NUM_SEP || letter_types[i + 1] == U_COMMON_NUM_SEP)) {
157 letter_types[i + 1] = U_EURO_NUM;
158 }
159 }
160 // Scan for sequences of European Number Terminators around ENs and convert
161 // them to ENs.
162 for (int i = 0; i < word_length_; i++) {
163 if (letter_types[i] == U_EURO_NUM_TERM) {
164 int j = i + 1;
165 while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) {
166 j++;
167 }
168 if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
169 // The sequence [i..j] should be converted to all European Numbers.
170 for (int k = i; k < j; k++) {
171 letter_types[k] = U_EURO_NUM;
172 }
173 }
174 j = i - 1;
175 while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) {
176 j--;
177 }
178 if (j > -1 && letter_types[j] == U_EURO_NUM) {
179 // The sequence [j..i] should be converted to all European Numbers.
180 for (int k = j; k <= i; k++) {
181 letter_types[k] = U_EURO_NUM;
182 }
183 }
184 }
185 }
186 // Step 2: Convert all remaining types to either L or R.
187 // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
188 // All other are R.
189 for (int i = 0; i < word_length_;) {
190 int ti = letter_types[i];
191 if (ti == U_LTR || ti == U_EURO_NUM) {
192 // Left to right sequence; scan to the end of it.
193 int last_good = i;
194 for (int j = i + 1; j < word_length_; j++) {
195 int tj = letter_types[j];
196 if (tj == U_LTR || tj == U_EURO_NUM) {
197 last_good = j;
198 } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
199 // do nothing.
200 } else {
201 break;
202 }
203 }
204 // [i..last_good] is the L sequence
205 for (int k = i; k <= last_good; k++) {
206 letter_types[k] = U_LTR;
207 }
208 i = last_good + 1;
209 } else {
210 letter_types[i] = U_RTL;
211 i++;
212 }
213 }
214
215 // At this point, letter_types is entirely U_LTR or U_RTL.
216 for (int i = word_length_ - 1; i >= 0;) {
217 if (letter_types[i] == U_RTL) {
218 blob_indices->push_back(i);
219 i--;
220 } else {
221 // left to right sequence. scan to the beginning.
222 int j = i - 1;
223 for (; j >= 0 && letter_types[j] != U_RTL; j--) {
224 } // pass
225 // Now (j, i] is LTR
226 for (int k = j + 1; k <= i; k++) {
227 blob_indices->push_back(k);
228 }
229 i = j;
230 }
231 }
232 ASSERT_HOST(blob_indices->size() == static_cast<size_t>(word_length_));
233 }
234
235 static void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) {
236 for (auto dir : dirs) {
237 switch (dir) {
238 case DIR_NEUTRAL:
239 tprintf("N ");
240 break;
241 case DIR_LEFT_TO_RIGHT:
242 tprintf("L ");
243 break;
244 case DIR_RIGHT_TO_LEFT:
245 tprintf("R ");
246 break;
247 case DIR_MIX:
248 tprintf("Z ");
249 break;
250 default:
251 tprintf("? ");
252 break;
253 }
254 }
255 tprintf("\n");
256 }
257
258 void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
259 std::vector<int> *word_indices) const {
260 std::vector<StrongScriptDirection> directions;
261 CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
262 }
263
264 void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
265 std::vector<StrongScriptDirection> *dirs_arg,
266 std::vector<int> *word_indices) const {
267 std::vector<StrongScriptDirection> dirs;
268 std::vector<StrongScriptDirection> *directions;
269 directions = (dirs_arg != nullptr) ? dirs_arg : &dirs;
270 directions->clear();
271
272 // A LTRResultIterator goes strictly left-to-right word order.
273 LTRResultIterator ltr_it(resit);
274 ltr_it.RestartRow();
275 if (ltr_it.Empty(RIL_WORD)) {
276 return;
277 }
278 do {
279 directions->push_back(ltr_it.WordDirection());
280 } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
281
282 word_indices->clear();
283 CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
284 }
285
286 void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr,
287 const std::vector<StrongScriptDirection> &word_dirs,
288 std::vector<int> *reading_order) {
289 reading_order->clear();
290 if (word_dirs.empty()) {
291 return;
292 }
293
294 // Take all of the runs of minor direction words and insert them
295 // in reverse order.
296 int minor_direction, major_direction, major_step, start, end;
297 if (paragraph_is_ltr) {
298 start = 0;
299 end = word_dirs.size();
300 major_step = 1;
301 major_direction = DIR_LEFT_TO_RIGHT;
302 minor_direction = DIR_RIGHT_TO_LEFT;
303 } else {
304 start = word_dirs.size() - 1;
305 end = -1;
306 major_step = -1;
307 major_direction = DIR_RIGHT_TO_LEFT;
308 minor_direction = DIR_LEFT_TO_RIGHT;
309 // Special rule: if there are neutral words at the right most side
310 // of a line adjacent to a left-to-right word in the middle of the
311 // line, we interpret the end of the line as a single LTR sequence.
312 if (word_dirs[start] == DIR_NEUTRAL) {
313 int neutral_end = start;
314 while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
315 neutral_end--;
316 }
317 if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
318 // LTR followed by neutrals.
319 // Scan for the beginning of the minor left-to-right run.
320 int left = neutral_end;
321 for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
322 if (word_dirs[i] == DIR_LEFT_TO_RIGHT) {
323 left = i;
324 }
325 }
326 reading_order->push_back(kMinorRunStart);
327 for (unsigned i = left; i < word_dirs.size(); i++) {
328 reading_order->push_back(i);
329 if (word_dirs[i] == DIR_MIX) {
330 reading_order->push_back(kComplexWord);
331 }
332 }
333 reading_order->push_back(kMinorRunEnd);
334 start = left - 1;
335 }
336 }
337 }
338 for (int i = start; i != end;) {
339 if (word_dirs[i] == minor_direction) {
340 int j = i;
341 while (j != end && word_dirs[j] != major_direction) {
342 j += major_step;
343 }
344 if (j == end) {
345 j -= major_step;
346 }
347 while (j != i && word_dirs[j] != minor_direction) {
348 j -= major_step;
349 }
350 // [j..i] is a minor direction run.
351 reading_order->push_back(kMinorRunStart);
352 for (int k = j; k != i; k -= major_step) {
353 reading_order->push_back(k);
354 }
355 reading_order->push_back(i);
356 reading_order->push_back(kMinorRunEnd);
357 i = j + major_step;
358 } else {
359 reading_order->push_back(i);
360 if (word_dirs[i] == DIR_MIX) {
361 reading_order->push_back(kComplexWord);
362 }
363 i += major_step;
364 }
365 }
366 }
367
368 int ResultIterator::LTRWordIndex() const {
369 int this_word_index = 0;
370 LTRResultIterator textline(*this);
371 textline.RestartRow();
372 while (!textline.PositionedAtSameWord(it_)) {
373 this_word_index++;
374 textline.Next(RIL_WORD);
375 }
376 return this_word_index;
377 }
378
379 void ResultIterator::MoveToLogicalStartOfWord() {
380 if (word_length_ == 0) {
381 BeginWord(0);
382 return;
383 }
384 std::vector<int> blob_order;
385 CalculateBlobOrder(&blob_order);
386 if (blob_order.empty() || blob_order[0] == 0) {
387 return;
388 }
389 BeginWord(blob_order[0]);
390 }
391
392 bool ResultIterator::IsAtFinalSymbolOfWord() const {
393 if (!it_->word()) {
394 return true;
395 }
396 std::vector<int> blob_order;
397 CalculateBlobOrder(&blob_order);
398 return blob_order.empty() || blob_order.back() == blob_index_;
399 }
400
401 bool ResultIterator::IsAtFirstSymbolOfWord() const {
402 if (!it_->word()) {
403 return true;
404 }
405 std::vector<int> blob_order;
406 CalculateBlobOrder(&blob_order);
407 return blob_order.empty() || blob_order[0] == blob_index_;
408 }
409
410 void ResultIterator::AppendSuffixMarks(std::string *text) const {
411 if (!it_->word()) {
412 return;
413 }
414 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
415 // scan forward to see what meta-information the word ordering algorithm
416 // left us.
417 // If this word is at the *end* of a minor run, insert the other
418 // direction's mark; else if this was a complex word, insert the
419 // current reading order's mark.
420 std::vector<int> textline_order;
421 CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &textline_order);
422 int this_word_index = LTRWordIndex();
423 size_t i = 0;
424 for (const auto word_index : textline_order) {
425 if (word_index == this_word_index) {
426 break;
427 }
428 i++;
429 }
430 if (i == textline_order.size()) {
431 return;
432 }
433
434 int last_non_word_mark = 0;
435 for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
436 last_non_word_mark = textline_order[i];
437 }
438 if (last_non_word_mark == kComplexWord) {
439 *text += reading_direction_is_ltr ? kLRM : kRLM;
440 } else if (last_non_word_mark == kMinorRunEnd) {
441 if (current_paragraph_is_ltr_) {
442 *text += kLRM;
443 } else {
444 *text += kRLM;
445 }
446 }
447 }
448
449 void ResultIterator::MoveToLogicalStartOfTextline() {
450 std::vector<int> word_indices;
451 RestartRow();
452 CalculateTextlineOrder(current_paragraph_is_ltr_, dynamic_cast<const LTRResultIterator &>(*this),
453 &word_indices);
454 unsigned i = 0;
455 for (; i < word_indices.size() && word_indices[i] < 0; i++) {
456 if (word_indices[i] == kMinorRunStart) {
457 in_minor_direction_ = true;
458 } else if (word_indices[i] == kMinorRunEnd) {
459 in_minor_direction_ = false;
460 }
461 }
462 if (in_minor_direction_) {
463 at_beginning_of_minor_run_ = true;
464 }
465 if (i >= word_indices.size()) {
466 return;
467 }
468 int first_word_index = word_indices[i];
469 for (int j = 0; j < first_word_index; j++) {
470 PageIterator::Next(RIL_WORD);
471 }
472 MoveToLogicalStartOfWord();
473 }
474
475 void ResultIterator::Begin() {
476 LTRResultIterator::Begin();
477 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
478 in_minor_direction_ = false;
479 at_beginning_of_minor_run_ = false;
480 MoveToLogicalStartOfTextline();
481 }
482
483 bool ResultIterator::Next(PageIteratorLevel level) {
484 if (it_->block() == nullptr) {
485 return false; // already at end!
486 }
487 switch (level) {
488 case RIL_BLOCK: // explicit fall-through
489 case RIL_PARA: // explicit fall-through
490 case RIL_TEXTLINE:
491 if (!PageIterator::Next(level)) {
492 return false;
493 }
494 if (IsWithinFirstTextlineOfParagraph()) {
495 // if we've advanced to a new paragraph,
496 // recalculate current_paragraph_is_ltr_
497 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
498 }
499 in_minor_direction_ = false;
500 MoveToLogicalStartOfTextline();
501 return it_->block() != nullptr;
502 case RIL_SYMBOL: {
503 std::vector<int> blob_order;
504 CalculateBlobOrder(&blob_order);
505 unsigned next_blob = 0;
506 while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) {
507 next_blob++;
508 }
509 next_blob++;
510 if (next_blob < blob_order.size()) {
511 // we're in the same word; simply advance one blob.
512 BeginWord(blob_order[next_blob]);
513 at_beginning_of_minor_run_ = false;
514 return true;
515 }
516 level = RIL_WORD; // we've fallen through to the next word.
517 }
518 // Fall through.
519 case RIL_WORD: // explicit fall-through.
520 {
521 if (it_->word() == nullptr) {
522 return Next(RIL_BLOCK);
523 }
524 std::vector<int> word_indices;
525 int this_word_index = LTRWordIndex();
526 CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);
527 int final_real_index = word_indices.size() - 1;
528 while (final_real_index > 0 && word_indices[final_real_index] < 0) {
529 final_real_index--;
530 }
531 for (int i = 0; i < final_real_index; i++) {
532 if (word_indices[i] == this_word_index) {
533 int j = i + 1;
534 for (; j < final_real_index && word_indices[j] < 0; j++) {
535 if (word_indices[j] == kMinorRunStart) {
536 in_minor_direction_ = true;
537 }
538 if (word_indices[j] == kMinorRunEnd) {
539 in_minor_direction_ = false;
540 }
541 }
542 at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
543 // awesome, we move to word_indices[j]
544 if (BidiDebug(3)) {
545 tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index, word_indices[j]);
546 }
547 PageIterator::RestartRow();
548 for (int k = 0; k < word_indices[j]; k++) {
549 PageIterator::Next(RIL_WORD);
550 }
551 MoveToLogicalStartOfWord();
552 return true;
553 }
554 }
555 if (BidiDebug(3)) {
556 tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
557 }
558 // we're going off the end of the text line.
559 return Next(RIL_TEXTLINE);
560 }
561 }
562 ASSERT_HOST(false); // shouldn't happen.
563 return false;
564 }
565
566 bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
567 if (it_->block() == nullptr) {
568 return false; // Already at the end!
569 }
570 if (it_->word() == nullptr) {
571 return true; // In an image block.
572 }
573 if (level == RIL_SYMBOL) {
574 return true; // Always at beginning of a symbol.
575 }
576
577 bool at_word_start = IsAtFirstSymbolOfWord();
578 if (level == RIL_WORD) {
579 return at_word_start;
580 }
581
582 ResultIterator line_start(*this);
583 // move to the first word in the line...
584 line_start.MoveToLogicalStartOfTextline();
585
586 bool at_textline_start = at_word_start && *line_start.it_ == *it_;
587 if (level == RIL_TEXTLINE) {
588 return at_textline_start;
589 }
590
591 // now we move to the left-most word...
592 line_start.RestartRow();
593 bool at_block_start =
594 at_textline_start && line_start.it_->block() != line_start.it_->prev_block();
595 if (level == RIL_BLOCK) {
596 return at_block_start;
597 }
598
599 bool at_para_start =
600 at_block_start || (at_textline_start && line_start.it_->row()->row->para() !=
601 line_start.it_->prev_row()->row->para());
602 if (level == RIL_PARA) {
603 return at_para_start;
604 }
605
606 ASSERT_HOST(false); // shouldn't happen.
607 return false;
608 }
609
610 /**
611 * NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the
612 * change that the variable next is now a ResultIterator instead of a
613 * PageIterator.
614 */
615 bool ResultIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const {
616 if (Empty(element)) {
617 return true; // Already at the end!
618 }
619 // The result is true if we step forward by element and find we are
620 // at the end of the page or at beginning of *all* levels in:
621 // [level, element).
622 // When there is more than one level difference between element and level,
623 // we could for instance move forward one symbol and still be at the first
624 // word on a line, so we also have to be at the first symbol in a word.
625 ResultIterator next(*this);
626 next.Next(element);
627 if (next.Empty(element)) {
628 return true; // Reached the end of the page.
629 }
630 while (element > level) {
631 element = static_cast<PageIteratorLevel>(element - 1);
632 if (!next.IsAtBeginningOf(element)) {
633 return false;
634 }
635 }
636 return true;
637 }
638
639 // Returns the number of blanks before the current word.
640 int ResultIterator::BlanksBeforeWord() const {
641 if (CurrentParagraphIsLtr()) {
642 return LTRResultIterator::BlanksBeforeWord();
643 }
644 return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
645 }
646
647 /**
648 * Returns the null terminated UTF-8 encoded text string for the current
649 * object at the given level. Use delete [] to free after use.
650 */
651 char *ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
652 if (it_->word() == nullptr) {
653 return nullptr; // Already at the end!
654 }
655 std::string text;
656 switch (level) {
657 case RIL_BLOCK: {
658 ResultIterator pp(*this);
659 do {
660 pp.AppendUTF8ParagraphText(&text);
661 } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
662 } break;
663 case RIL_PARA:
664 AppendUTF8ParagraphText(&text);
665 break;
666 case RIL_TEXTLINE: {
667 ResultIterator it(*this);
668 it.MoveToLogicalStartOfTextline();
669 it.IterateAndAppendUTF8TextlineText(&text);
670 } break;
671 case RIL_WORD:
672 AppendUTF8WordText(&text);
673 break;
674 case RIL_SYMBOL: {
675 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
676 if (at_beginning_of_minor_run_) {
677 text += reading_direction_is_ltr ? kLRM : kRLM;
678 }
679 text = it_->word()->BestUTF8(blob_index_, false);
680 if (IsAtFinalSymbolOfWord()) {
681 AppendSuffixMarks(&text);
682 }
683 } break;
684 }
685 return copy_string(text);
686 }
687 std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
688 *ResultIterator::GetRawLSTMTimesteps() const {
689 if (it_->word() != nullptr) {
690 return &it_->word()->segmented_timesteps;
691 } else {
692 return nullptr;
693 }
694 }
695
696 std::vector<std::vector<std::pair<const char *, float>>> *ResultIterator::GetBestLSTMSymbolChoices()
697 const {
698 if (it_->word() != nullptr) {
699 return &it_->word()->CTC_symbol_choices;
700 } else {
701 return nullptr;
702 }
703 }
704
705 void ResultIterator::AppendUTF8WordText(std::string *text) const {
706 if (!it_->word()) {
707 return;
708 }
709 ASSERT_HOST(it_->word()->best_choice != nullptr);
710 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
711 if (at_beginning_of_minor_run_) {
712 *text += reading_direction_is_ltr ? kLRM : kRLM;
713 }
714
715 std::vector<int> blob_order;
716 CalculateBlobOrder(&blob_order);
717 for (int i : blob_order) {
718 *text += it_->word()->BestUTF8(i, false);
719 }
720 AppendSuffixMarks(text);
721 }
722
723 void ResultIterator::IterateAndAppendUTF8TextlineText(std::string *text) {
724 if (Empty(RIL_WORD)) {
725 Next(RIL_WORD);
726 return;
727 }
728 if (BidiDebug(1)) {
729 std::vector<int> textline_order;
730 std::vector<StrongScriptDirection> dirs;
731 CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs, &textline_order);
732 tprintf("Strong Script dirs [%p/P=%s]: ",
733 static_cast<void *>(it_->row()),
734 current_paragraph_is_ltr_ ? "ltr" : "rtl");
735 PrintScriptDirs(dirs);
736 tprintf("Logical textline order [%p/P=%s]: ",
737 static_cast<void *>(it_->row()),
738 current_paragraph_is_ltr_ ? "ltr" : "rtl");
739 for (int i : textline_order) {
740 tprintf("%d ", i);
741 }
742 tprintf("\n");
743 }
744
745 int words_appended = 0;
746 do {
747 int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : (words_appended > 0);
748 for (int i = 0; i < numSpaces; ++i) {
749 *text += " ";
750 }
751 AppendUTF8WordText(text);
752 words_appended++;
753 if (BidiDebug(2)) {
754 tprintf("Num spaces=%d, text=%s\n", numSpaces, text->c_str());
755 }
756 } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
757 if (BidiDebug(1)) {
758 tprintf("%d words printed\n", words_appended);
759 }
760 *text += line_separator_;
761 // If we just finished a paragraph, add an extra newline.
762 if (IsAtBeginningOf(RIL_PARA)) {
763 *text += paragraph_separator_;
764 }
765 }
766
767 void ResultIterator::AppendUTF8ParagraphText(std::string *text) const {
768 ResultIterator it(*this);
769 it.RestartParagraph();
770 it.MoveToLogicalStartOfTextline();
771 if (it.Empty(RIL_WORD)) {
772 return;
773 }
774 do {
775 it.IterateAndAppendUTF8TextlineText(text);
776 } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));
777 }
778
779 bool ResultIterator::BidiDebug(int min_level) const {
780 int debug_level = 1;
781 auto *p = ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params,
782 tesseract_->params()->int_params);
783 if (p != nullptr) {
784 debug_level = (int32_t)(*p);
785 }
786 return debug_level >= min_level;
787 }
788
789 } // namespace tesseract.