Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/resultiterator.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: resultiterator.cpp | |
| 3 // Description: Iterator for tesseract results that is capable of | |
| 4 // iterating in proper reading order over Bi Directional | |
| 5 // (e.g. mixed Hebrew and English) text. | |
| 6 // Author: David Eger | |
| 7 // | |
| 8 // (C) Copyright 2011, Google Inc. | |
| 9 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 10 // you may not use this file except in compliance with the License. | |
| 11 // You may obtain a copy of the License at | |
| 12 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 13 // Unless required by applicable law or agreed to in writing, software | |
| 14 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 16 // See the License for the specific language governing permissions and | |
| 17 // limitations under the License. | |
| 18 // | |
| 19 /////////////////////////////////////////////////////////////////////// | |
| 20 | |
| 21 #include <tesseract/resultiterator.h> | |
| 22 | |
| 23 #include "helpers.h" // for copy_string | |
| 24 #include "pageres.h" | |
| 25 #include "tesseractclass.h" | |
| 26 #include "unicharset.h" | |
| 27 | |
| 28 #include <allheaders.h> | |
| 29 | |
| 30 #include <set> | |
| 31 #include <vector> | |
| 32 | |
// Unicode directional formatting marks that the text-producing methods in
// this file splice into their output around runs of mixed-direction text.
static const char *const kLRM{"\u200E"}; // U+200E LEFT-TO-RIGHT MARK
static const char *const kRLM{"\u200F"}; // U+200F RIGHT-TO-LEFT MARK
| 35 | |
| 36 namespace tesseract { | |
| 37 | |
| 38 ResultIterator::ResultIterator(const LTRResultIterator &resit) : LTRResultIterator(resit) { | |
| 39 in_minor_direction_ = false; | |
| 40 at_beginning_of_minor_run_ = false; | |
| 41 preserve_interword_spaces_ = false; | |
| 42 | |
| 43 auto *p = ParamUtils::FindParam<BoolParam>( | |
| 44 "preserve_interword_spaces", GlobalParams()->bool_params, tesseract_->params()->bool_params); | |
| 45 if (p != nullptr) { | |
| 46 preserve_interword_spaces_ = (bool)(*p); | |
| 47 } | |
| 48 | |
| 49 current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); | |
| 50 MoveToLogicalStartOfTextline(); | |
| 51 } | |
| 52 | |
| 53 ResultIterator *ResultIterator::StartOfParagraph(const LTRResultIterator &resit) { | |
| 54 return new ResultIterator(resit); | |
| 55 } | |
| 56 | |
| 57 bool ResultIterator::ParagraphIsLtr() const { | |
| 58 return current_paragraph_is_ltr_; | |
| 59 } | |
| 60 | |
| 61 bool ResultIterator::CurrentParagraphIsLtr() const { | |
| 62 if (!it_->word()) { | |
| 63 return true; // doesn't matter. | |
| 64 } | |
| 65 LTRResultIterator it(*this); | |
| 66 it.RestartParagraph(); | |
| 67 // Try to figure out the ltr-ness of the paragraph. The rules below | |
| 68 // make more sense in the context of a difficult paragraph example. | |
| 69 // Here we denote {ltr characters, RTL CHARACTERS}: | |
| 70 // | |
| 71 // "don't go in there!" DAIS EH | |
| 72 // EHT OTNI DEPMUJ FELSMIH NEHT DNA | |
| 73 // .GNIDLIUB GNINRUB | |
| 74 // | |
| 75 // On the first line, the left-most word is LTR and the rightmost word | |
| 76 // is RTL. Thus, we are better off taking the majority direction for | |
| 77 // the whole paragraph contents. So instead of "the leftmost word is LTR" | |
| 78 // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs | |
| 79 // would not do: Typically an RTL paragraph would *not* start with an LTR | |
| 80 // word. So our heuristics are as follows: | |
| 81 // | |
| 82 // (1) If the first text line has an RTL word in the left-most position | |
| 83 // it is RTL. | |
| 84 // (2) If the first text line has an LTR word in the right-most position | |
| 85 // it is LTR. | |
| 86 // (3) If neither of the above is true, take the majority count for the | |
| 87 // paragraph -- if there are more rtl words, it is RTL. If there | |
| 88 // are more LTR words, it's LTR. | |
| 89 bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT; | |
| 90 bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT; | |
| 91 int num_ltr, num_rtl; | |
| 92 num_rtl = leftmost_rtl ? 1 : 0; | |
| 93 num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0; | |
| 94 for (it.Next(RIL_WORD); !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE); | |
| 95 it.Next(RIL_WORD)) { | |
| 96 StrongScriptDirection dir = it.WordDirection(); | |
| 97 rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT); | |
| 98 num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0; | |
| 99 num_ltr += rightmost_ltr ? 1 : 0; | |
| 100 } | |
| 101 if (leftmost_rtl) { | |
| 102 return false; | |
| 103 } | |
| 104 if (rightmost_ltr) { | |
| 105 return true; | |
| 106 } | |
| 107 // First line is ambiguous. Take statistics on the whole paragraph. | |
| 108 if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) { | |
| 109 do { | |
| 110 StrongScriptDirection dir = it.WordDirection(); | |
| 111 num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0; | |
| 112 num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0; | |
| 113 } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)); | |
| 114 } | |
| 115 return num_ltr >= num_rtl; | |
| 116 } | |
| 117 | |
| 118 const int ResultIterator::kMinorRunStart = -1; | |
| 119 const int ResultIterator::kMinorRunEnd = -2; | |
| 120 const int ResultIterator::kComplexWord = -3; | |
| 121 | |
| 122 void ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices) const { | |
| 123 bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_; | |
| 124 blob_indices->clear(); | |
| 125 if (Empty(RIL_WORD)) { | |
| 126 return; | |
| 127 } | |
| 128 if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) { | |
| 129 // Easy! just return the blobs in order; | |
| 130 for (int i = 0; i < word_length_; i++) { | |
| 131 blob_indices->push_back(i); | |
| 132 } | |
| 133 return; | |
| 134 } | |
| 135 | |
| 136 // The blobs are in left-to-right order, but the current reading context | |
| 137 // is right-to-left. | |
| 138 const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT; | |
| 139 const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT; | |
| 140 const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER; | |
| 141 const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR; | |
| 142 const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR; | |
| 143 const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR; | |
| 144 const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL; | |
| 145 | |
| 146 // Step 1: Scan for and mark European Number sequences | |
| 147 // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]* | |
| 148 std::vector<int> letter_types; | |
| 149 letter_types.reserve(word_length_); | |
| 150 for (int i = 0; i < word_length_; i++) { | |
| 151 letter_types.push_back(it_->word()->SymbolDirection(i)); | |
| 152 } | |
| 153 // Convert a single separator sandwiched between two ENs into an EN. | |
| 154 for (int i = 0; i + 2 < word_length_; i++) { | |
| 155 if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM && | |
| 156 (letter_types[i + 1] == U_EURO_NUM_SEP || letter_types[i + 1] == U_COMMON_NUM_SEP)) { | |
| 157 letter_types[i + 1] = U_EURO_NUM; | |
| 158 } | |
| 159 } | |
| 160 // Scan for sequences of European Number Terminators around ENs and convert | |
| 161 // them to ENs. | |
| 162 for (int i = 0; i < word_length_; i++) { | |
| 163 if (letter_types[i] == U_EURO_NUM_TERM) { | |
| 164 int j = i + 1; | |
| 165 while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { | |
| 166 j++; | |
| 167 } | |
| 168 if (j < word_length_ && letter_types[j] == U_EURO_NUM) { | |
| 169 // The sequence [i..j] should be converted to all European Numbers. | |
| 170 for (int k = i; k < j; k++) { | |
| 171 letter_types[k] = U_EURO_NUM; | |
| 172 } | |
| 173 } | |
| 174 j = i - 1; | |
| 175 while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { | |
| 176 j--; | |
| 177 } | |
| 178 if (j > -1 && letter_types[j] == U_EURO_NUM) { | |
| 179 // The sequence [j..i] should be converted to all European Numbers. | |
| 180 for (int k = j; k <= i; k++) { | |
| 181 letter_types[k] = U_EURO_NUM; | |
| 182 } | |
| 183 } | |
| 184 } | |
| 185 } | |
| 186 // Step 2: Convert all remaining types to either L or R. | |
| 187 // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L. | |
| 188 // All other are R. | |
| 189 for (int i = 0; i < word_length_;) { | |
| 190 int ti = letter_types[i]; | |
| 191 if (ti == U_LTR || ti == U_EURO_NUM) { | |
| 192 // Left to right sequence; scan to the end of it. | |
| 193 int last_good = i; | |
| 194 for (int j = i + 1; j < word_length_; j++) { | |
| 195 int tj = letter_types[j]; | |
| 196 if (tj == U_LTR || tj == U_EURO_NUM) { | |
| 197 last_good = j; | |
| 198 } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) { | |
| 199 // do nothing. | |
| 200 } else { | |
| 201 break; | |
| 202 } | |
| 203 } | |
| 204 // [i..last_good] is the L sequence | |
| 205 for (int k = i; k <= last_good; k++) { | |
| 206 letter_types[k] = U_LTR; | |
| 207 } | |
| 208 i = last_good + 1; | |
| 209 } else { | |
| 210 letter_types[i] = U_RTL; | |
| 211 i++; | |
| 212 } | |
| 213 } | |
| 214 | |
| 215 // At this point, letter_types is entirely U_LTR or U_RTL. | |
| 216 for (int i = word_length_ - 1; i >= 0;) { | |
| 217 if (letter_types[i] == U_RTL) { | |
| 218 blob_indices->push_back(i); | |
| 219 i--; | |
| 220 } else { | |
| 221 // left to right sequence. scan to the beginning. | |
| 222 int j = i - 1; | |
| 223 for (; j >= 0 && letter_types[j] != U_RTL; j--) { | |
| 224 } // pass | |
| 225 // Now (j, i] is LTR | |
| 226 for (int k = j + 1; k <= i; k++) { | |
| 227 blob_indices->push_back(k); | |
| 228 } | |
| 229 i = j; | |
| 230 } | |
| 231 } | |
| 232 ASSERT_HOST(blob_indices->size() == static_cast<size_t>(word_length_)); | |
| 233 } | |
| 234 | |
| 235 static void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) { | |
| 236 for (auto dir : dirs) { | |
| 237 switch (dir) { | |
| 238 case DIR_NEUTRAL: | |
| 239 tprintf("N "); | |
| 240 break; | |
| 241 case DIR_LEFT_TO_RIGHT: | |
| 242 tprintf("L "); | |
| 243 break; | |
| 244 case DIR_RIGHT_TO_LEFT: | |
| 245 tprintf("R "); | |
| 246 break; | |
| 247 case DIR_MIX: | |
| 248 tprintf("Z "); | |
| 249 break; | |
| 250 default: | |
| 251 tprintf("? "); | |
| 252 break; | |
| 253 } | |
| 254 } | |
| 255 tprintf("\n"); | |
| 256 } | |
| 257 | |
| 258 void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit, | |
| 259 std::vector<int> *word_indices) const { | |
| 260 std::vector<StrongScriptDirection> directions; | |
| 261 CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices); | |
| 262 } | |
| 263 | |
| 264 void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit, | |
| 265 std::vector<StrongScriptDirection> *dirs_arg, | |
| 266 std::vector<int> *word_indices) const { | |
| 267 std::vector<StrongScriptDirection> dirs; | |
| 268 std::vector<StrongScriptDirection> *directions; | |
| 269 directions = (dirs_arg != nullptr) ? dirs_arg : &dirs; | |
| 270 directions->clear(); | |
| 271 | |
| 272 // A LTRResultIterator goes strictly left-to-right word order. | |
| 273 LTRResultIterator ltr_it(resit); | |
| 274 ltr_it.RestartRow(); | |
| 275 if (ltr_it.Empty(RIL_WORD)) { | |
| 276 return; | |
| 277 } | |
| 278 do { | |
| 279 directions->push_back(ltr_it.WordDirection()); | |
| 280 } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE)); | |
| 281 | |
| 282 word_indices->clear(); | |
| 283 CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices); | |
| 284 } | |
| 285 | |
| 286 void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, | |
| 287 const std::vector<StrongScriptDirection> &word_dirs, | |
| 288 std::vector<int> *reading_order) { | |
| 289 reading_order->clear(); | |
| 290 if (word_dirs.empty()) { | |
| 291 return; | |
| 292 } | |
| 293 | |
| 294 // Take all of the runs of minor direction words and insert them | |
| 295 // in reverse order. | |
| 296 int minor_direction, major_direction, major_step, start, end; | |
| 297 if (paragraph_is_ltr) { | |
| 298 start = 0; | |
| 299 end = word_dirs.size(); | |
| 300 major_step = 1; | |
| 301 major_direction = DIR_LEFT_TO_RIGHT; | |
| 302 minor_direction = DIR_RIGHT_TO_LEFT; | |
| 303 } else { | |
| 304 start = word_dirs.size() - 1; | |
| 305 end = -1; | |
| 306 major_step = -1; | |
| 307 major_direction = DIR_RIGHT_TO_LEFT; | |
| 308 minor_direction = DIR_LEFT_TO_RIGHT; | |
| 309 // Special rule: if there are neutral words at the right most side | |
| 310 // of a line adjacent to a left-to-right word in the middle of the | |
| 311 // line, we interpret the end of the line as a single LTR sequence. | |
| 312 if (word_dirs[start] == DIR_NEUTRAL) { | |
| 313 int neutral_end = start; | |
| 314 while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) { | |
| 315 neutral_end--; | |
| 316 } | |
| 317 if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) { | |
| 318 // LTR followed by neutrals. | |
| 319 // Scan for the beginning of the minor left-to-right run. | |
| 320 int left = neutral_end; | |
| 321 for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) { | |
| 322 if (word_dirs[i] == DIR_LEFT_TO_RIGHT) { | |
| 323 left = i; | |
| 324 } | |
| 325 } | |
| 326 reading_order->push_back(kMinorRunStart); | |
| 327 for (unsigned i = left; i < word_dirs.size(); i++) { | |
| 328 reading_order->push_back(i); | |
| 329 if (word_dirs[i] == DIR_MIX) { | |
| 330 reading_order->push_back(kComplexWord); | |
| 331 } | |
| 332 } | |
| 333 reading_order->push_back(kMinorRunEnd); | |
| 334 start = left - 1; | |
| 335 } | |
| 336 } | |
| 337 } | |
| 338 for (int i = start; i != end;) { | |
| 339 if (word_dirs[i] == minor_direction) { | |
| 340 int j = i; | |
| 341 while (j != end && word_dirs[j] != major_direction) { | |
| 342 j += major_step; | |
| 343 } | |
| 344 if (j == end) { | |
| 345 j -= major_step; | |
| 346 } | |
| 347 while (j != i && word_dirs[j] != minor_direction) { | |
| 348 j -= major_step; | |
| 349 } | |
| 350 // [j..i] is a minor direction run. | |
| 351 reading_order->push_back(kMinorRunStart); | |
| 352 for (int k = j; k != i; k -= major_step) { | |
| 353 reading_order->push_back(k); | |
| 354 } | |
| 355 reading_order->push_back(i); | |
| 356 reading_order->push_back(kMinorRunEnd); | |
| 357 i = j + major_step; | |
| 358 } else { | |
| 359 reading_order->push_back(i); | |
| 360 if (word_dirs[i] == DIR_MIX) { | |
| 361 reading_order->push_back(kComplexWord); | |
| 362 } | |
| 363 i += major_step; | |
| 364 } | |
| 365 } | |
| 366 } | |
| 367 | |
| 368 int ResultIterator::LTRWordIndex() const { | |
| 369 int this_word_index = 0; | |
| 370 LTRResultIterator textline(*this); | |
| 371 textline.RestartRow(); | |
| 372 while (!textline.PositionedAtSameWord(it_)) { | |
| 373 this_word_index++; | |
| 374 textline.Next(RIL_WORD); | |
| 375 } | |
| 376 return this_word_index; | |
| 377 } | |
| 378 | |
| 379 void ResultIterator::MoveToLogicalStartOfWord() { | |
| 380 if (word_length_ == 0) { | |
| 381 BeginWord(0); | |
| 382 return; | |
| 383 } | |
| 384 std::vector<int> blob_order; | |
| 385 CalculateBlobOrder(&blob_order); | |
| 386 if (blob_order.empty() || blob_order[0] == 0) { | |
| 387 return; | |
| 388 } | |
| 389 BeginWord(blob_order[0]); | |
| 390 } | |
| 391 | |
| 392 bool ResultIterator::IsAtFinalSymbolOfWord() const { | |
| 393 if (!it_->word()) { | |
| 394 return true; | |
| 395 } | |
| 396 std::vector<int> blob_order; | |
| 397 CalculateBlobOrder(&blob_order); | |
| 398 return blob_order.empty() || blob_order.back() == blob_index_; | |
| 399 } | |
| 400 | |
| 401 bool ResultIterator::IsAtFirstSymbolOfWord() const { | |
| 402 if (!it_->word()) { | |
| 403 return true; | |
| 404 } | |
| 405 std::vector<int> blob_order; | |
| 406 CalculateBlobOrder(&blob_order); | |
| 407 return blob_order.empty() || blob_order[0] == blob_index_; | |
| 408 } | |
| 409 | |
| 410 void ResultIterator::AppendSuffixMarks(std::string *text) const { | |
| 411 if (!it_->word()) { | |
| 412 return; | |
| 413 } | |
| 414 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_; | |
| 415 // scan forward to see what meta-information the word ordering algorithm | |
| 416 // left us. | |
| 417 // If this word is at the *end* of a minor run, insert the other | |
| 418 // direction's mark; else if this was a complex word, insert the | |
| 419 // current reading order's mark. | |
| 420 std::vector<int> textline_order; | |
| 421 CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &textline_order); | |
| 422 int this_word_index = LTRWordIndex(); | |
| 423 size_t i = 0; | |
| 424 for (const auto word_index : textline_order) { | |
| 425 if (word_index == this_word_index) { | |
| 426 break; | |
| 427 } | |
| 428 i++; | |
| 429 } | |
| 430 if (i == textline_order.size()) { | |
| 431 return; | |
| 432 } | |
| 433 | |
| 434 int last_non_word_mark = 0; | |
| 435 for (i++; i < textline_order.size() && textline_order[i] < 0; i++) { | |
| 436 last_non_word_mark = textline_order[i]; | |
| 437 } | |
| 438 if (last_non_word_mark == kComplexWord) { | |
| 439 *text += reading_direction_is_ltr ? kLRM : kRLM; | |
| 440 } else if (last_non_word_mark == kMinorRunEnd) { | |
| 441 if (current_paragraph_is_ltr_) { | |
| 442 *text += kLRM; | |
| 443 } else { | |
| 444 *text += kRLM; | |
| 445 } | |
| 446 } | |
| 447 } | |
| 448 | |
| 449 void ResultIterator::MoveToLogicalStartOfTextline() { | |
| 450 std::vector<int> word_indices; | |
| 451 RestartRow(); | |
| 452 CalculateTextlineOrder(current_paragraph_is_ltr_, dynamic_cast<const LTRResultIterator &>(*this), | |
| 453 &word_indices); | |
| 454 unsigned i = 0; | |
| 455 for (; i < word_indices.size() && word_indices[i] < 0; i++) { | |
| 456 if (word_indices[i] == kMinorRunStart) { | |
| 457 in_minor_direction_ = true; | |
| 458 } else if (word_indices[i] == kMinorRunEnd) { | |
| 459 in_minor_direction_ = false; | |
| 460 } | |
| 461 } | |
| 462 if (in_minor_direction_) { | |
| 463 at_beginning_of_minor_run_ = true; | |
| 464 } | |
| 465 if (i >= word_indices.size()) { | |
| 466 return; | |
| 467 } | |
| 468 int first_word_index = word_indices[i]; | |
| 469 for (int j = 0; j < first_word_index; j++) { | |
| 470 PageIterator::Next(RIL_WORD); | |
| 471 } | |
| 472 MoveToLogicalStartOfWord(); | |
| 473 } | |
| 474 | |
| 475 void ResultIterator::Begin() { | |
| 476 LTRResultIterator::Begin(); | |
| 477 current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); | |
| 478 in_minor_direction_ = false; | |
| 479 at_beginning_of_minor_run_ = false; | |
| 480 MoveToLogicalStartOfTextline(); | |
| 481 } | |
| 482 | |
| 483 bool ResultIterator::Next(PageIteratorLevel level) { | |
| 484 if (it_->block() == nullptr) { | |
| 485 return false; // already at end! | |
| 486 } | |
| 487 switch (level) { | |
| 488 case RIL_BLOCK: // explicit fall-through | |
| 489 case RIL_PARA: // explicit fall-through | |
| 490 case RIL_TEXTLINE: | |
| 491 if (!PageIterator::Next(level)) { | |
| 492 return false; | |
| 493 } | |
| 494 if (IsWithinFirstTextlineOfParagraph()) { | |
| 495 // if we've advanced to a new paragraph, | |
| 496 // recalculate current_paragraph_is_ltr_ | |
| 497 current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); | |
| 498 } | |
| 499 in_minor_direction_ = false; | |
| 500 MoveToLogicalStartOfTextline(); | |
| 501 return it_->block() != nullptr; | |
| 502 case RIL_SYMBOL: { | |
| 503 std::vector<int> blob_order; | |
| 504 CalculateBlobOrder(&blob_order); | |
| 505 unsigned next_blob = 0; | |
| 506 while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) { | |
| 507 next_blob++; | |
| 508 } | |
| 509 next_blob++; | |
| 510 if (next_blob < blob_order.size()) { | |
| 511 // we're in the same word; simply advance one blob. | |
| 512 BeginWord(blob_order[next_blob]); | |
| 513 at_beginning_of_minor_run_ = false; | |
| 514 return true; | |
| 515 } | |
| 516 level = RIL_WORD; // we've fallen through to the next word. | |
| 517 } | |
| 518 // Fall through. | |
| 519 case RIL_WORD: // explicit fall-through. | |
| 520 { | |
| 521 if (it_->word() == nullptr) { | |
| 522 return Next(RIL_BLOCK); | |
| 523 } | |
| 524 std::vector<int> word_indices; | |
| 525 int this_word_index = LTRWordIndex(); | |
| 526 CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices); | |
| 527 int final_real_index = word_indices.size() - 1; | |
| 528 while (final_real_index > 0 && word_indices[final_real_index] < 0) { | |
| 529 final_real_index--; | |
| 530 } | |
| 531 for (int i = 0; i < final_real_index; i++) { | |
| 532 if (word_indices[i] == this_word_index) { | |
| 533 int j = i + 1; | |
| 534 for (; j < final_real_index && word_indices[j] < 0; j++) { | |
| 535 if (word_indices[j] == kMinorRunStart) { | |
| 536 in_minor_direction_ = true; | |
| 537 } | |
| 538 if (word_indices[j] == kMinorRunEnd) { | |
| 539 in_minor_direction_ = false; | |
| 540 } | |
| 541 } | |
| 542 at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart); | |
| 543 // awesome, we move to word_indices[j] | |
| 544 if (BidiDebug(3)) { | |
| 545 tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index, word_indices[j]); | |
| 546 } | |
| 547 PageIterator::RestartRow(); | |
| 548 for (int k = 0; k < word_indices[j]; k++) { | |
| 549 PageIterator::Next(RIL_WORD); | |
| 550 } | |
| 551 MoveToLogicalStartOfWord(); | |
| 552 return true; | |
| 553 } | |
| 554 } | |
| 555 if (BidiDebug(3)) { | |
| 556 tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index); | |
| 557 } | |
| 558 // we're going off the end of the text line. | |
| 559 return Next(RIL_TEXTLINE); | |
| 560 } | |
| 561 } | |
| 562 ASSERT_HOST(false); // shouldn't happen. | |
| 563 return false; | |
| 564 } | |
| 565 | |
| 566 bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const { | |
| 567 if (it_->block() == nullptr) { | |
| 568 return false; // Already at the end! | |
| 569 } | |
| 570 if (it_->word() == nullptr) { | |
| 571 return true; // In an image block. | |
| 572 } | |
| 573 if (level == RIL_SYMBOL) { | |
| 574 return true; // Always at beginning of a symbol. | |
| 575 } | |
| 576 | |
| 577 bool at_word_start = IsAtFirstSymbolOfWord(); | |
| 578 if (level == RIL_WORD) { | |
| 579 return at_word_start; | |
| 580 } | |
| 581 | |
| 582 ResultIterator line_start(*this); | |
| 583 // move to the first word in the line... | |
| 584 line_start.MoveToLogicalStartOfTextline(); | |
| 585 | |
| 586 bool at_textline_start = at_word_start && *line_start.it_ == *it_; | |
| 587 if (level == RIL_TEXTLINE) { | |
| 588 return at_textline_start; | |
| 589 } | |
| 590 | |
| 591 // now we move to the left-most word... | |
| 592 line_start.RestartRow(); | |
| 593 bool at_block_start = | |
| 594 at_textline_start && line_start.it_->block() != line_start.it_->prev_block(); | |
| 595 if (level == RIL_BLOCK) { | |
| 596 return at_block_start; | |
| 597 } | |
| 598 | |
| 599 bool at_para_start = | |
| 600 at_block_start || (at_textline_start && line_start.it_->row()->row->para() != | |
| 601 line_start.it_->prev_row()->row->para()); | |
| 602 if (level == RIL_PARA) { | |
| 603 return at_para_start; | |
| 604 } | |
| 605 | |
| 606 ASSERT_HOST(false); // shouldn't happen. | |
| 607 return false; | |
| 608 } | |
| 609 | |
| 610 /** | |
| 611 * NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the | |
| 612 * change that the variable next is now a ResultIterator instead of a | |
| 613 * PageIterator. | |
| 614 */ | |
| 615 bool ResultIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const { | |
| 616 if (Empty(element)) { | |
| 617 return true; // Already at the end! | |
| 618 } | |
| 619 // The result is true if we step forward by element and find we are | |
| 620 // at the end of the page or at beginning of *all* levels in: | |
| 621 // [level, element). | |
| 622 // When there is more than one level difference between element and level, | |
| 623 // we could for instance move forward one symbol and still be at the first | |
| 624 // word on a line, so we also have to be at the first symbol in a word. | |
| 625 ResultIterator next(*this); | |
| 626 next.Next(element); | |
| 627 if (next.Empty(element)) { | |
| 628 return true; // Reached the end of the page. | |
| 629 } | |
| 630 while (element > level) { | |
| 631 element = static_cast<PageIteratorLevel>(element - 1); | |
| 632 if (!next.IsAtBeginningOf(element)) { | |
| 633 return false; | |
| 634 } | |
| 635 } | |
| 636 return true; | |
| 637 } | |
| 638 | |
| 639 // Returns the number of blanks before the current word. | |
| 640 int ResultIterator::BlanksBeforeWord() const { | |
| 641 if (CurrentParagraphIsLtr()) { | |
| 642 return LTRResultIterator::BlanksBeforeWord(); | |
| 643 } | |
| 644 return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1; | |
| 645 } | |
| 646 | |
| 647 /** | |
| 648 * Returns the null terminated UTF-8 encoded text string for the current | |
| 649 * object at the given level. Use delete [] to free after use. | |
| 650 */ | |
| 651 char *ResultIterator::GetUTF8Text(PageIteratorLevel level) const { | |
| 652 if (it_->word() == nullptr) { | |
| 653 return nullptr; // Already at the end! | |
| 654 } | |
| 655 std::string text; | |
| 656 switch (level) { | |
| 657 case RIL_BLOCK: { | |
| 658 ResultIterator pp(*this); | |
| 659 do { | |
| 660 pp.AppendUTF8ParagraphText(&text); | |
| 661 } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block()); | |
| 662 } break; | |
| 663 case RIL_PARA: | |
| 664 AppendUTF8ParagraphText(&text); | |
| 665 break; | |
| 666 case RIL_TEXTLINE: { | |
| 667 ResultIterator it(*this); | |
| 668 it.MoveToLogicalStartOfTextline(); | |
| 669 it.IterateAndAppendUTF8TextlineText(&text); | |
| 670 } break; | |
| 671 case RIL_WORD: | |
| 672 AppendUTF8WordText(&text); | |
| 673 break; | |
| 674 case RIL_SYMBOL: { | |
| 675 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_; | |
| 676 if (at_beginning_of_minor_run_) { | |
| 677 text += reading_direction_is_ltr ? kLRM : kRLM; | |
| 678 } | |
| 679 text = it_->word()->BestUTF8(blob_index_, false); | |
| 680 if (IsAtFinalSymbolOfWord()) { | |
| 681 AppendSuffixMarks(&text); | |
| 682 } | |
| 683 } break; | |
| 684 } | |
| 685 return copy_string(text); | |
| 686 } | |
| 687 std::vector<std::vector<std::vector<std::pair<const char *, float>>>> | |
| 688 *ResultIterator::GetRawLSTMTimesteps() const { | |
| 689 if (it_->word() != nullptr) { | |
| 690 return &it_->word()->segmented_timesteps; | |
| 691 } else { | |
| 692 return nullptr; | |
| 693 } | |
| 694 } | |
| 695 | |
| 696 std::vector<std::vector<std::pair<const char *, float>>> *ResultIterator::GetBestLSTMSymbolChoices() | |
| 697 const { | |
| 698 if (it_->word() != nullptr) { | |
| 699 return &it_->word()->CTC_symbol_choices; | |
| 700 } else { | |
| 701 return nullptr; | |
| 702 } | |
| 703 } | |
| 704 | |
| 705 void ResultIterator::AppendUTF8WordText(std::string *text) const { | |
| 706 if (!it_->word()) { | |
| 707 return; | |
| 708 } | |
| 709 ASSERT_HOST(it_->word()->best_choice != nullptr); | |
| 710 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_; | |
| 711 if (at_beginning_of_minor_run_) { | |
| 712 *text += reading_direction_is_ltr ? kLRM : kRLM; | |
| 713 } | |
| 714 | |
| 715 std::vector<int> blob_order; | |
| 716 CalculateBlobOrder(&blob_order); | |
| 717 for (int i : blob_order) { | |
| 718 *text += it_->word()->BestUTF8(i, false); | |
| 719 } | |
| 720 AppendSuffixMarks(text); | |
| 721 } | |
| 722 | |
| 723 void ResultIterator::IterateAndAppendUTF8TextlineText(std::string *text) { | |
| 724 if (Empty(RIL_WORD)) { | |
| 725 Next(RIL_WORD); | |
| 726 return; | |
| 727 } | |
| 728 if (BidiDebug(1)) { | |
| 729 std::vector<int> textline_order; | |
| 730 std::vector<StrongScriptDirection> dirs; | |
| 731 CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs, &textline_order); | |
| 732 tprintf("Strong Script dirs [%p/P=%s]: ", | |
| 733 static_cast<void *>(it_->row()), | |
| 734 current_paragraph_is_ltr_ ? "ltr" : "rtl"); | |
| 735 PrintScriptDirs(dirs); | |
| 736 tprintf("Logical textline order [%p/P=%s]: ", | |
| 737 static_cast<void *>(it_->row()), | |
| 738 current_paragraph_is_ltr_ ? "ltr" : "rtl"); | |
| 739 for (int i : textline_order) { | |
| 740 tprintf("%d ", i); | |
| 741 } | |
| 742 tprintf("\n"); | |
| 743 } | |
| 744 | |
| 745 int words_appended = 0; | |
| 746 do { | |
| 747 int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : (words_appended > 0); | |
| 748 for (int i = 0; i < numSpaces; ++i) { | |
| 749 *text += " "; | |
| 750 } | |
| 751 AppendUTF8WordText(text); | |
| 752 words_appended++; | |
| 753 if (BidiDebug(2)) { | |
| 754 tprintf("Num spaces=%d, text=%s\n", numSpaces, text->c_str()); | |
| 755 } | |
| 756 } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE)); | |
| 757 if (BidiDebug(1)) { | |
| 758 tprintf("%d words printed\n", words_appended); | |
| 759 } | |
| 760 *text += line_separator_; | |
| 761 // If we just finished a paragraph, add an extra newline. | |
| 762 if (IsAtBeginningOf(RIL_PARA)) { | |
| 763 *text += paragraph_separator_; | |
| 764 } | |
| 765 } | |
| 766 | |
| 767 void ResultIterator::AppendUTF8ParagraphText(std::string *text) const { | |
| 768 ResultIterator it(*this); | |
| 769 it.RestartParagraph(); | |
| 770 it.MoveToLogicalStartOfTextline(); | |
| 771 if (it.Empty(RIL_WORD)) { | |
| 772 return; | |
| 773 } | |
| 774 do { | |
| 775 it.IterateAndAppendUTF8TextlineText(text); | |
| 776 } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA)); | |
| 777 } | |
| 778 | |
| 779 bool ResultIterator::BidiDebug(int min_level) const { | |
| 780 int debug_level = 1; | |
| 781 auto *p = ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params, | |
| 782 tesseract_->params()->int_params); | |
| 783 if (p != nullptr) { | |
| 784 debug_level = (int32_t)(*p); | |
| 785 } | |
| 786 return debug_level >= min_level; | |
| 787 } | |
| 788 | |
| 789 } // namespace tesseract. |
