Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccmain/resultiterator.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccmain/resultiterator.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,789 @@ +/////////////////////////////////////////////////////////////////////// +// File: resultiterator.cpp +// Description: Iterator for tesseract results that is capable of +// iterating in proper reading order over Bi Directional +// (e.g. mixed Hebrew and English) text. +// Author: David Eger +// +// (C) Copyright 2011, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include <tesseract/resultiterator.h> + +#include "helpers.h" // for copy_string +#include "pageres.h" +#include "tesseractclass.h" +#include "unicharset.h" + +#include <allheaders.h> + +#include <set> +#include <vector> + +static const char *const kLRM = "\u200E"; // Left-to-Right Mark +static const char *const kRLM = "\u200F"; // Right-to-Left Mark + +namespace tesseract { + +ResultIterator::ResultIterator(const LTRResultIterator &resit) : LTRResultIterator(resit) { + in_minor_direction_ = false; + at_beginning_of_minor_run_ = false; + preserve_interword_spaces_ = false; + + auto *p = ParamUtils::FindParam<BoolParam>( + "preserve_interword_spaces", GlobalParams()->bool_params, tesseract_->params()->bool_params); + if (p != nullptr) { + preserve_interword_spaces_ = (bool)(*p); + } + + current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); + MoveToLogicalStartOfTextline(); +} + +ResultIterator *ResultIterator::StartOfParagraph(const LTRResultIterator &resit) { + return new ResultIterator(resit); +} + +bool ResultIterator::ParagraphIsLtr() const { + return current_paragraph_is_ltr_; +} + +bool ResultIterator::CurrentParagraphIsLtr() const { + if (!it_->word()) { + return true; // doesn't matter. + } + LTRResultIterator it(*this); + it.RestartParagraph(); + // Try to figure out the ltr-ness of the paragraph. The rules below + // make more sense in the context of a difficult paragraph example. + // Here we denote {ltr characters, RTL CHARACTERS}: + // + // "don't go in there!" DAIS EH + // EHT OTNI DEPMUJ FELSMIH NEHT DNA + // .GNIDLIUB GNINRUB + // + // On the first line, the left-most word is LTR and the rightmost word + // is RTL. Thus, we are better off taking the majority direction for + // the whole paragraph contents. So instead of "the leftmost word is LTR" + // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs + // would not do: Typically an RTL paragraph would *not* start with an LTR + // word. So our heuristics are as follows: + // + // (1) If the first text line has an RTL word in the left-most position + // it is RTL. + // (2) If the first text line has an LTR word in the right-most position + // it is LTR. + // (3) If neither of the above is true, take the majority count for the + // paragraph -- if there are more rtl words, it is RTL. If there + // are more LTR words, it's LTR. + bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT; + bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT; + int num_ltr, num_rtl; + num_rtl = leftmost_rtl ? 1 : 0; + num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0; + for (it.Next(RIL_WORD); !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE); + it.Next(RIL_WORD)) { + StrongScriptDirection dir = it.WordDirection(); + rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT); + num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0; + num_ltr += rightmost_ltr ? 1 : 0; + } + if (leftmost_rtl) { + return false; + } + if (rightmost_ltr) { + return true; + } + // First line is ambiguous. Take statistics on the whole paragraph. + if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) { + do { + StrongScriptDirection dir = it.WordDirection(); + num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0; + num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0; + } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)); + } + return num_ltr >= num_rtl; +} + +const int ResultIterator::kMinorRunStart = -1; +const int ResultIterator::kMinorRunEnd = -2; +const int ResultIterator::kComplexWord = -3; + +void ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices) const { + bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_; + blob_indices->clear(); + if (Empty(RIL_WORD)) { + return; + } + if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) { + // Easy! just return the blobs in order; + for (int i = 0; i < word_length_; i++) { + blob_indices->push_back(i); + } + return; + } + + // The blobs are in left-to-right order, but the current reading context + // is right-to-left. + const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT; + const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT; + const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER; + const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR; + const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR; + const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR; + const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL; + + // Step 1: Scan for and mark European Number sequences + // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]* + std::vector<int> letter_types; + letter_types.reserve(word_length_); + for (int i = 0; i < word_length_; i++) { + letter_types.push_back(it_->word()->SymbolDirection(i)); + } + // Convert a single separator sandwiched between two ENs into an EN. + for (int i = 0; i + 2 < word_length_; i++) { + if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM && + (letter_types[i + 1] == U_EURO_NUM_SEP || letter_types[i + 1] == U_COMMON_NUM_SEP)) { + letter_types[i + 1] = U_EURO_NUM; + } + } + // Scan for sequences of European Number Terminators around ENs and convert + // them to ENs. + for (int i = 0; i < word_length_; i++) { + if (letter_types[i] == U_EURO_NUM_TERM) { + int j = i + 1; + while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { + j++; + } + if (j < word_length_ && letter_types[j] == U_EURO_NUM) { + // The sequence [i..j] should be converted to all European Numbers. + for (int k = i; k < j; k++) { + letter_types[k] = U_EURO_NUM; + } + } + j = i - 1; + while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { + j--; + } + if (j > -1 && letter_types[j] == U_EURO_NUM) { + // The sequence [j..i] should be converted to all European Numbers. + for (int k = j; k <= i; k++) { + letter_types[k] = U_EURO_NUM; + } + } + } + } + // Step 2: Convert all remaining types to either L or R. + // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L. + // All other are R. + for (int i = 0; i < word_length_;) { + int ti = letter_types[i]; + if (ti == U_LTR || ti == U_EURO_NUM) { + // Left to right sequence; scan to the end of it. + int last_good = i; + for (int j = i + 1; j < word_length_; j++) { + int tj = letter_types[j]; + if (tj == U_LTR || tj == U_EURO_NUM) { + last_good = j; + } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) { + // do nothing. + } else { + break; + } + } + // [i..last_good] is the L sequence + for (int k = i; k <= last_good; k++) { + letter_types[k] = U_LTR; + } + i = last_good + 1; + } else { + letter_types[i] = U_RTL; + i++; + } + } + + // At this point, letter_types is entirely U_LTR or U_RTL. + for (int i = word_length_ - 1; i >= 0;) { + if (letter_types[i] == U_RTL) { + blob_indices->push_back(i); + i--; + } else { + // left to right sequence. scan to the beginning. + int j = i - 1; + for (; j >= 0 && letter_types[j] != U_RTL; j--) { + } // pass + // Now (j, i] is LTR + for (int k = j + 1; k <= i; k++) { + blob_indices->push_back(k); + } + i = j; + } + } + ASSERT_HOST(blob_indices->size() == static_cast<size_t>(word_length_)); +} + +static void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) { + for (auto dir : dirs) { + switch (dir) { + case DIR_NEUTRAL: + tprintf("N "); + break; + case DIR_LEFT_TO_RIGHT: + tprintf("L "); + break; + case DIR_RIGHT_TO_LEFT: + tprintf("R "); + break; + case DIR_MIX: + tprintf("Z "); + break; + default: + tprintf("? "); + break; + } + } + tprintf("\n"); +} + +void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit, + std::vector<int> *word_indices) const { + std::vector<StrongScriptDirection> directions; + CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices); +} + +void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit, + std::vector<StrongScriptDirection> *dirs_arg, + std::vector<int> *word_indices) const { + std::vector<StrongScriptDirection> dirs; + std::vector<StrongScriptDirection> *directions; + directions = (dirs_arg != nullptr) ? dirs_arg : &dirs; + directions->clear(); + + // A LTRResultIterator goes strictly left-to-right word order. + LTRResultIterator ltr_it(resit); + ltr_it.RestartRow(); + if (ltr_it.Empty(RIL_WORD)) { + return; + } + do { + directions->push_back(ltr_it.WordDirection()); + } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE)); + + word_indices->clear(); + CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices); +} + +void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, + const std::vector<StrongScriptDirection> &word_dirs, + std::vector<int> *reading_order) { + reading_order->clear(); + if (word_dirs.empty()) { + return; + } + + // Take all of the runs of minor direction words and insert them + // in reverse order. + int minor_direction, major_direction, major_step, start, end; + if (paragraph_is_ltr) { + start = 0; + end = word_dirs.size(); + major_step = 1; + major_direction = DIR_LEFT_TO_RIGHT; + minor_direction = DIR_RIGHT_TO_LEFT; + } else { + start = word_dirs.size() - 1; + end = -1; + major_step = -1; + major_direction = DIR_RIGHT_TO_LEFT; + minor_direction = DIR_LEFT_TO_RIGHT; + // Special rule: if there are neutral words at the right most side + // of a line adjacent to a left-to-right word in the middle of the + // line, we interpret the end of the line as a single LTR sequence. + if (word_dirs[start] == DIR_NEUTRAL) { + int neutral_end = start; + while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) { + neutral_end--; + } + if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) { + // LTR followed by neutrals. + // Scan for the beginning of the minor left-to-right run. + int left = neutral_end; + for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) { + if (word_dirs[i] == DIR_LEFT_TO_RIGHT) { + left = i; + } + } + reading_order->push_back(kMinorRunStart); + for (unsigned i = left; i < word_dirs.size(); i++) { + reading_order->push_back(i); + if (word_dirs[i] == DIR_MIX) { + reading_order->push_back(kComplexWord); + } + } + reading_order->push_back(kMinorRunEnd); + start = left - 1; + } + } + } + for (int i = start; i != end;) { + if (word_dirs[i] == minor_direction) { + int j = i; + while (j != end && word_dirs[j] != major_direction) { + j += major_step; + } + if (j == end) { + j -= major_step; + } + while (j != i && word_dirs[j] != minor_direction) { + j -= major_step; + } + // [j..i] is a minor direction run. + reading_order->push_back(kMinorRunStart); + for (int k = j; k != i; k -= major_step) { + reading_order->push_back(k); + } + reading_order->push_back(i); + reading_order->push_back(kMinorRunEnd); + i = j + major_step; + } else { + reading_order->push_back(i); + if (word_dirs[i] == DIR_MIX) { + reading_order->push_back(kComplexWord); + } + i += major_step; + } + } +} + +int ResultIterator::LTRWordIndex() const { + int this_word_index = 0; + LTRResultIterator textline(*this); + textline.RestartRow(); + while (!textline.PositionedAtSameWord(it_)) { + this_word_index++; + textline.Next(RIL_WORD); + } + return this_word_index; +} + +void ResultIterator::MoveToLogicalStartOfWord() { + if (word_length_ == 0) { + BeginWord(0); + return; + } + std::vector<int> blob_order; + CalculateBlobOrder(&blob_order); + if (blob_order.empty() || blob_order[0] == 0) { + return; + } + BeginWord(blob_order[0]); +} + +bool ResultIterator::IsAtFinalSymbolOfWord() const { + if (!it_->word()) { + return true; + } + std::vector<int> blob_order; + CalculateBlobOrder(&blob_order); + return blob_order.empty() || blob_order.back() == blob_index_; +} + +bool ResultIterator::IsAtFirstSymbolOfWord() const { + if (!it_->word()) { + return true; + } + std::vector<int> blob_order; + CalculateBlobOrder(&blob_order); + return blob_order.empty() || blob_order[0] == blob_index_; +} + +void ResultIterator::AppendSuffixMarks(std::string *text) const { + if (!it_->word()) { + return; + } + bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_; + // scan forward to see what meta-information the word ordering algorithm + // left us. + // If this word is at the *end* of a minor run, insert the other + // direction's mark; else if this was a complex word, insert the + // current reading order's mark. + std::vector<int> textline_order; + CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &textline_order); + int this_word_index = LTRWordIndex(); + size_t i = 0; + for (const auto word_index : textline_order) { + if (word_index == this_word_index) { + break; + } + i++; + } + if (i == textline_order.size()) { + return; + } + + int last_non_word_mark = 0; + for (i++; i < textline_order.size() && textline_order[i] < 0; i++) { + last_non_word_mark = textline_order[i]; + } + if (last_non_word_mark == kComplexWord) { + *text += reading_direction_is_ltr ? kLRM : kRLM; + } else if (last_non_word_mark == kMinorRunEnd) { + if (current_paragraph_is_ltr_) { + *text += kLRM; + } else { + *text += kRLM; + } + } +} + +void ResultIterator::MoveToLogicalStartOfTextline() { + std::vector<int> word_indices; + RestartRow(); + CalculateTextlineOrder(current_paragraph_is_ltr_, dynamic_cast<const LTRResultIterator &>(*this), + &word_indices); + unsigned i = 0; + for (; i < word_indices.size() && word_indices[i] < 0; i++) { + if (word_indices[i] == kMinorRunStart) { + in_minor_direction_ = true; + } else if (word_indices[i] == kMinorRunEnd) { + in_minor_direction_ = false; + } + } + if (in_minor_direction_) { + at_beginning_of_minor_run_ = true; + } + if (i >= word_indices.size()) { + return; + } + int first_word_index = word_indices[i]; + for (int j = 0; j < first_word_index; j++) { + PageIterator::Next(RIL_WORD); + } + MoveToLogicalStartOfWord(); +} + +void ResultIterator::Begin() { + LTRResultIterator::Begin(); + current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); + in_minor_direction_ = false; + at_beginning_of_minor_run_ = false; + MoveToLogicalStartOfTextline(); +} + +bool ResultIterator::Next(PageIteratorLevel level) { + if (it_->block() == nullptr) { + return false; // already at end! + } + switch (level) { + case RIL_BLOCK: // explicit fall-through + case RIL_PARA: // explicit fall-through + case RIL_TEXTLINE: + if (!PageIterator::Next(level)) { + return false; + } + if (IsWithinFirstTextlineOfParagraph()) { + // if we've advanced to a new paragraph, + // recalculate current_paragraph_is_ltr_ + current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); + } + in_minor_direction_ = false; + MoveToLogicalStartOfTextline(); + return it_->block() != nullptr; + case RIL_SYMBOL: { + std::vector<int> blob_order; + CalculateBlobOrder(&blob_order); + unsigned next_blob = 0; + while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) { + next_blob++; + } + next_blob++; + if (next_blob < blob_order.size()) { + // we're in the same word; simply advance one blob. + BeginWord(blob_order[next_blob]); + at_beginning_of_minor_run_ = false; + return true; + } + level = RIL_WORD; // we've fallen through to the next word. + } + // Fall through. + case RIL_WORD: // explicit fall-through. + { + if (it_->word() == nullptr) { + return Next(RIL_BLOCK); + } + std::vector<int> word_indices; + int this_word_index = LTRWordIndex(); + CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices); + int final_real_index = word_indices.size() - 1; + while (final_real_index > 0 && word_indices[final_real_index] < 0) { + final_real_index--; + } + for (int i = 0; i < final_real_index; i++) { + if (word_indices[i] == this_word_index) { + int j = i + 1; + for (; j < final_real_index && word_indices[j] < 0; j++) { + if (word_indices[j] == kMinorRunStart) { + in_minor_direction_ = true; + } + if (word_indices[j] == kMinorRunEnd) { + in_minor_direction_ = false; + } + } + at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart); + // awesome, we move to word_indices[j] + if (BidiDebug(3)) { + tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index, word_indices[j]); + } + PageIterator::RestartRow(); + for (int k = 0; k < word_indices[j]; k++) { + PageIterator::Next(RIL_WORD); + } + MoveToLogicalStartOfWord(); + return true; + } + } + if (BidiDebug(3)) { + tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index); + } + // we're going off the end of the text line. + return Next(RIL_TEXTLINE); + } + } + ASSERT_HOST(false); // shouldn't happen. + return false; +} + +bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const { + if (it_->block() == nullptr) { + return false; // Already at the end! + } + if (it_->word() == nullptr) { + return true; // In an image block. + } + if (level == RIL_SYMBOL) { + return true; // Always at beginning of a symbol. + } + + bool at_word_start = IsAtFirstSymbolOfWord(); + if (level == RIL_WORD) { + return at_word_start; + } + + ResultIterator line_start(*this); + // move to the first word in the line... + line_start.MoveToLogicalStartOfTextline(); + + bool at_textline_start = at_word_start && *line_start.it_ == *it_; + if (level == RIL_TEXTLINE) { + return at_textline_start; + } + + // now we move to the left-most word... + line_start.RestartRow(); + bool at_block_start = + at_textline_start && line_start.it_->block() != line_start.it_->prev_block(); + if (level == RIL_BLOCK) { + return at_block_start; + } + + bool at_para_start = + at_block_start || (at_textline_start && line_start.it_->row()->row->para() != + line_start.it_->prev_row()->row->para()); + if (level == RIL_PARA) { + return at_para_start; + } + + ASSERT_HOST(false); // shouldn't happen. + return false; +} + +/** + * NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the + * change that the variable next is now a ResultIterator instead of a + * PageIterator. + */ +bool ResultIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const { + if (Empty(element)) { + return true; // Already at the end! + } + // The result is true if we step forward by element and find we are + // at the end of the page or at beginning of *all* levels in: + // [level, element). + // When there is more than one level difference between element and level, + // we could for instance move forward one symbol and still be at the first + // word on a line, so we also have to be at the first symbol in a word. + ResultIterator next(*this); + next.Next(element); + if (next.Empty(element)) { + return true; // Reached the end of the page. + } + while (element > level) { + element = static_cast<PageIteratorLevel>(element - 1); + if (!next.IsAtBeginningOf(element)) { + return false; + } + } + return true; +} + +// Returns the number of blanks before the current word. +int ResultIterator::BlanksBeforeWord() const { + if (CurrentParagraphIsLtr()) { + return LTRResultIterator::BlanksBeforeWord(); + } + return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1; +} + +/** + * Returns the null terminated UTF-8 encoded text string for the current + * object at the given level. Use delete [] to free after use. + */ +char *ResultIterator::GetUTF8Text(PageIteratorLevel level) const { + if (it_->word() == nullptr) { + return nullptr; // Already at the end! + } + std::string text; + switch (level) { + case RIL_BLOCK: { + ResultIterator pp(*this); + do { + pp.AppendUTF8ParagraphText(&text); + } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block()); + } break; + case RIL_PARA: + AppendUTF8ParagraphText(&text); + break; + case RIL_TEXTLINE: { + ResultIterator it(*this); + it.MoveToLogicalStartOfTextline(); + it.IterateAndAppendUTF8TextlineText(&text); + } break; + case RIL_WORD: + AppendUTF8WordText(&text); + break; + case RIL_SYMBOL: { + bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_; + if (at_beginning_of_minor_run_) { + text += reading_direction_is_ltr ? kLRM : kRLM; + } + text = it_->word()->BestUTF8(blob_index_, false); + if (IsAtFinalSymbolOfWord()) { + AppendSuffixMarks(&text); + } + } break; + } + return copy_string(text); +} +std::vector<std::vector<std::vector<std::pair<const char *, float>>>> + *ResultIterator::GetRawLSTMTimesteps() const { + if (it_->word() != nullptr) { + return &it_->word()->segmented_timesteps; + } else { + return nullptr; + } +} + +std::vector<std::vector<std::pair<const char *, float>>> *ResultIterator::GetBestLSTMSymbolChoices() + const { + if (it_->word() != nullptr) { + return &it_->word()->CTC_symbol_choices; + } else { + return nullptr; + } +} + +void ResultIterator::AppendUTF8WordText(std::string *text) const { + if (!it_->word()) { + return; + } + ASSERT_HOST(it_->word()->best_choice != nullptr); + bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_; + if (at_beginning_of_minor_run_) { + *text += reading_direction_is_ltr ? kLRM : kRLM; + } + + std::vector<int> blob_order; + CalculateBlobOrder(&blob_order); + for (int i : blob_order) { + *text += it_->word()->BestUTF8(i, false); + } + AppendSuffixMarks(text); +} + +void ResultIterator::IterateAndAppendUTF8TextlineText(std::string *text) { + if (Empty(RIL_WORD)) { + Next(RIL_WORD); + return; + } + if (BidiDebug(1)) { + std::vector<int> textline_order; + std::vector<StrongScriptDirection> dirs; + CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs, &textline_order); + tprintf("Strong Script dirs [%p/P=%s]: ", + static_cast<void *>(it_->row()), + current_paragraph_is_ltr_ ? "ltr" : "rtl"); + PrintScriptDirs(dirs); + tprintf("Logical textline order [%p/P=%s]: ", + static_cast<void *>(it_->row()), + current_paragraph_is_ltr_ ? "ltr" : "rtl"); + for (int i : textline_order) { + tprintf("%d ", i); + } + tprintf("\n"); + } + + int words_appended = 0; + do { + int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : (words_appended > 0); + for (int i = 0; i < numSpaces; ++i) { + *text += " "; + } + AppendUTF8WordText(text); + words_appended++; + if (BidiDebug(2)) { + tprintf("Num spaces=%d, text=%s\n", numSpaces, text->c_str()); + } + } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE)); + if (BidiDebug(1)) { + tprintf("%d words printed\n", words_appended); + } + *text += line_separator_; + // If we just finished a paragraph, add an extra newline. + if (IsAtBeginningOf(RIL_PARA)) { + *text += paragraph_separator_; + } +} + +void ResultIterator::AppendUTF8ParagraphText(std::string *text) const { + ResultIterator it(*this); + it.RestartParagraph(); + it.MoveToLogicalStartOfTextline(); + if (it.Empty(RIL_WORD)) { + return; + } + do { + it.IterateAndAppendUTF8TextlineText(text); + } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA)); +} + +bool ResultIterator::BidiDebug(int min_level) const { + int debug_level = 1; + auto *p = ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params, + tesseract_->params()->int_params); + if (p != nullptr) { + debug_level = (int32_t)(*p); + } + return debug_level >= min_level; +} + +} // namespace tesseract.
