Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/paragraphs.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: paragraphs.cpp | |
| 3 * Description: Paragraph detection for tesseract. | |
| 4 * Author: David Eger | |
| 5 * | |
| 6 * (C) Copyright 2011, Google Inc. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #include "paragraphs.h" | |
| 20 | |
| 21 #include "helpers.h" // for UpdateRange, ClipToRange | |
| 22 #include "host.h" // for NearlyEqual | |
| 23 #include "mutableiterator.h" // for MutableIterator | |
| 24 #include "ocrblock.h" // for BLOCK | |
| 25 #include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA... | |
| 26 #include "ocrrow.h" // for ROW | |
| 27 #include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO... | |
| 28 #include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels | |
| 29 #include "pdblock.h" // for PDBLK | |
| 30 #include "polyblk.h" // for POLY_BLOCK | |
| 31 #include "ratngs.h" // for WERD_CHOICE | |
| 32 #include "rect.h" // for TBOX | |
| 33 #include "statistc.h" // for STATS | |
| 34 #include "tesserrstream.h" // for tesserr | |
| 35 #include "tprintf.h" // for tprintf | |
| 36 #include "unicharset.h" // for UNICHARSET | |
| 37 #include "werd.h" // for WERD, W_REP_CHAR | |
| 38 | |
| 39 #include <tesseract/pageiterator.h> // for PageIterator | |
| 40 #include <tesseract/publictypes.h> // for JUSTIFICATION_LEFT, JUSTIFICATION_R... | |
| 41 #include <tesseract/unichar.h> // for UNICHAR, UNICHAR_ID | |
| 42 | |
| 43 #include <algorithm> // for max | |
| 44 #include <cctype> // for isspace | |
| 45 #include <cmath> // for abs | |
| 46 #include <cstdio> // for snprintf | |
| 47 #include <cstdlib> // for abs | |
| 48 #include <cstring> // for strchr, strlen | |
| 49 #include <memory> // for unique_ptr | |
| 50 | |
// Unicode bidi control characters used by RtlEmbed (below) to wrap words
// for direction-correct display in debug output.
// NOTE(review): U+202A is LEFT-TO-RIGHT EMBEDDING and U+202B is
// RIGHT-TO-LEFT EMBEDDING, so the name/comment here disagree with the
// actual codepoint in the literal -- confirm against UAX #9 before relying
// on the embedding direction.
static const char *const kRLE = "\u202A"; // Right-to-Left Embedding
static const char *const kPDF = "\u202C"; // Pop Directional Formatting
| 53 | |
| 54 namespace tesseract { | |
| 55 | |
// Special "weak" ParagraphModels.
// These are sentinel pointer values -- never dereferenced, only compared
// against -- used to tag rows that look like a paragraph "crown" (a first
// line whose alignment is evident but whose full model is not yet pinned
// down), distinguishing left- and right-aligned crowns.
const ParagraphModel *kCrownLeft =
    reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
const ParagraphModel *kCrownRight =
    reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));

// Do the text and geometry of two rows support a paragraph break between them?
// (Defined later in this file; forward-declared for use by earlier helpers.)
static bool LikelyParagraphStart(const RowScratchRegisters &before,
                                 const RowScratchRegisters &after,
                                 tesseract::ParagraphJustification j);
| 66 | |
// Given the width of a typical space between words, return the pixel
// tolerance within which two left (or right) edges are considered to sit
// at the same tab stop, i.e. to be "aligned" for paragraph purposes.
static int Epsilon(int space_pix) {
  // 80% of an interword space, with integer truncation toward zero.
  const int scaled = space_pix * 4;
  return scaled / 5;
}
| 73 | |
// Validate the half-open range [row_start, row_end) against rows' bounds
// and a minimum row count.  Returns false when the range is malformed
// (always logged) or holds fewer than min_num_rows rows (logged only at
// debug_level > 1); returns true when the range is usable.
static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
                              const std::vector<RowScratchRegisters> *rows, int row_start,
                              int row_end) {
  // A negative row_end wraps to a huge size_t, so it is caught here too.
  if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
    tesserr << "Invalid arguments rows[" << row_start << ", " << row_end
            << ") while rows is of size " << rows->size() << ".\n";
    return false;
  }
  if (row_end - row_start < min_num_rows) {
    if (debug_level > 1) {
      tprintf("# Too few rows[%d, %d) for %s.\n", row_start, row_end, function_name);
    }
    return false;
  }
  return true;
}
| 90 | |
| 91 // =============================== Debug Code ================================ | |
| 92 | |
| 93 // Given a row-major matrix of unicode text and a column separator, print | |
| 94 // a formatted table. For ASCII, we get good column alignment. | |
| 95 static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) { | |
| 96 std::vector<int> max_col_widths; | |
| 97 for (const auto &row : rows) { | |
| 98 auto num_columns = row.size(); | |
| 99 for (size_t c = 0; c < num_columns; c++) { | |
| 100 int num_unicodes = 0; | |
| 101 for (char i : row[c]) { | |
| 102 if ((i & 0xC0) != 0x80) { | |
| 103 num_unicodes++; | |
| 104 } | |
| 105 } | |
| 106 if (c >= max_col_widths.size()) { | |
| 107 max_col_widths.push_back(num_unicodes); | |
| 108 } else { | |
| 109 if (num_unicodes > max_col_widths[c]) { | |
| 110 max_col_widths[c] = num_unicodes; | |
| 111 } | |
| 112 } | |
| 113 } | |
| 114 } | |
| 115 | |
| 116 std::vector<std::string> col_width_patterns; | |
| 117 col_width_patterns.reserve(max_col_widths.size()); | |
| 118 for (int max_col_width : max_col_widths) { | |
| 119 col_width_patterns.push_back(std::string("%-") + std::to_string(max_col_width) + "s"); | |
| 120 } | |
| 121 | |
| 122 for (const auto &row : rows) { | |
| 123 for (unsigned c = 0; c < row.size(); c++) { | |
| 124 if (c > 0) { | |
| 125 tprintf("%s", colsep); | |
| 126 } | |
| 127 tprintf(col_width_patterns[c].c_str(), row[c].c_str()); | |
| 128 } | |
| 129 tprintf("\n"); | |
| 130 } | |
| 131 } | |
| 132 | |
| 133 static std::string RtlEmbed(const std::string &word, bool rtlify) { | |
| 134 if (rtlify) { | |
| 135 return std::string(kRLE) + word + std::string(kPDF); | |
| 136 } | |
| 137 return word; | |
| 138 } | |
| 139 | |
// Print the current thoughts of the paragraph detector.
// Emits one table row per text row: index, interword space, leader flag,
// left/right word summaries, the scratch-register debug fields, then the
// row text itself; finishes with the list of active paragraph models.
static void PrintDetectorState(const ParagraphTheory &theory,
                               const std::vector<RowScratchRegisters> &rows) {
  std::vector<std::vector<std::string>> output;
  output.emplace_back();
  output.back().push_back("#row");
  output.back().push_back("space");
  output.back().push_back("..");
  output.back().push_back("lword[widthSEL]");
  output.back().push_back("rword[widthSEL]");
  RowScratchRegisters::AppendDebugHeaderFields(output.back());
  output.back().push_back("text");

  for (unsigned i = 0; i < rows.size(); i++) {
    output.emplace_back();
    std::vector<std::string> &row = output.back();
    const RowInfo &ri = *rows[i].ri_;
    row.push_back(std::to_string(i));
    row.push_back(std::to_string(ri.average_interword_space));
    row.emplace_back(ri.has_leaders ? ".." : " ");
    // Word summaries read "[<width><S|s><E|e><L|l>]": uppercase means the
    // starts-idea / ends-idea / list-item flag is set.
    row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + std::to_string(ri.lword_box.width()) +
                  (ri.lword_likely_starts_idea ? "S" : "s") +
                  (ri.lword_likely_ends_idea ? "E" : "e") +
                  (ri.lword_indicates_list_item ? "L" : "l") + "]");
    row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + std::to_string(ri.rword_box.width()) +
                  (ri.rword_likely_starts_idea ? "S" : "s") +
                  (ri.rword_likely_ends_idea ? "E" : "e") +
                  (ri.rword_indicates_list_item ? "L" : "l") + "]");
    rows[i].AppendDebugInfo(theory, row);
    row.push_back(RtlEmbed(ri.text, !ri.ltr));
  }
  PrintTable(output, " ");

  tprintf("Active Paragraph Models:\n");
  unsigned m = 0;
  for (const auto &model : theory.models()) {
    tprintf(" %d: %s\n", ++m, model->ToString().c_str());
  }
}
| 179 | |
| 180 static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory, | |
| 181 const std::vector<RowScratchRegisters> &rows) { | |
| 182 if (!should_print) { | |
| 183 return; | |
| 184 } | |
| 185 tprintf("# %s\n", phase); | |
| 186 PrintDetectorState(theory, rows); | |
| 187 } | |
| 188 | |
| 189 // Print out the text for rows[row_start, row_end) | |
| 190 static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start, | |
| 191 int row_end) { | |
| 192 tprintf("======================================\n"); | |
| 193 for (int row = row_start; row < row_end; row++) { | |
| 194 tprintf("%s\n", rows[row].ri_->text.c_str()); | |
| 195 } | |
| 196 tprintf("======================================\n"); | |
| 197 } | |
| 198 | |
| 199 // ============= Brain Dead Language Model (ASCII Version) =================== | |
| 200 | |
// True iff ch is an unaccented ASCII letter a-z or A-Z.
static bool IsLatinLetter(int ch) {
  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}

// True for letters commonly OCR-confused with digits (o/O for 0, l/I for 1).
static bool IsDigitLike(int ch) {
  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
}

// True iff ch is punctuation that typically opens a quotation or bracket.
static bool IsOpeningPunct(int ch) {
  return strchr("'\"({[", ch) != nullptr;
}

// True iff ch is punctuation that can legitimately end a sentence or clause.
static bool IsTerminalPunct(int ch) {
  return strchr(":'\".?!]})", ch) != nullptr;
}

// Advance str past every leading character that appears in toskip;
// returns a pointer to the first character not in toskip (or the NUL).
static const char *SkipChars(const char *str, const char *toskip) {
  while (*str != '\0' && strchr(toskip, *str)) {
    str++;
  }
  return str;
}

// Overload: advance str past every leading character satisfying skip.
static const char *SkipChars(const char *str, bool (*skip)(int)) {
  while (*str != '\0' && skip(*str)) {
    str++;
  }
  return str;
}

// Advance str past at most ONE leading character from toskip.
static const char *SkipOne(const char *str, const char *toskip) {
  if (*str != '\0' && strchr(toskip, *str)) {
    return str + 1;
  }
  return str;
}

// Return whether it is very likely that this is a numeral marker that could
// start a list item. Some examples include:
// A I iii. VI (2) 3.5. [C-4]
// Consumes up to three "segments", each of which is an optional pair of
// openers, a numeral (roman letters, digits, or a single latin letter),
// then closers/separators; the whole word must be consumed to qualify.
static bool LikelyListNumeral(const std::string &word) {
  const char *kRomans = "ivxlmdIVXLMD";
  // BUGFIX: '6' was missing from the digit set ("012345789"), so markers
  // such as "6." were never recognized as list numerals.
  const char *kDigits = "0123456789";
  const char *kOpen = "[{(";
  const char *kSep = ":;-.,";
  const char *kClose = "]})";

  int num_segments = 0;
  const char *pos = word.c_str();
  while (*pos != '\0' && num_segments < 3) {
    // skip up to two open parens.
    const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
    const char *numeral_end = SkipChars(numeral_start, kRomans);
    if (numeral_end != numeral_start) {
      // Got Roman Numeral. Great.
    } else {
      numeral_end = SkipChars(numeral_start, kDigits);
      if (numeral_end == numeral_start) {
        // If there's a single latin letter, we can use that.
        numeral_end = SkipChars(numeral_start, IsLatinLetter);
        if (numeral_end - numeral_start != 1) {
          break;
        }
      }
    }
    // We got some sort of numeral.
    num_segments++;
    // Skip any trailing parens or punctuation.
    pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
    if (pos == numeral_end) {
      break;
    }
  }
  return *pos == '\0';
}

// Return whether word is a single character commonly used as a list bullet.
static bool LikelyListMark(const std::string &word) {
  const char *kListMarks = "0Oo*.,+.";
  return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
}

// ASCII entry point: is this word plausibly a list bullet or numeral marker?
bool AsciiLikelyListItem(const std::string &word) {
  return LikelyListMark(word) || LikelyListNumeral(word);
}
| 286 | |
// ========== Brain Dead Language Model (Tesseract Version) ================

// Return the first Unicode Codepoint from werd[pos].
// Returns 0 for null inputs or positions past the word.
// NOTE(review): the guard uses pos > werd->length(), so pos == length()
// slips through to unichar_id(pos), which looks one past the last unichar
// -- confirm WERD_CHOICE tolerates that index (no caller in this file
// passes it).
static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {
  if (!u || !werd || pos > werd->length()) {
    return 0;
  }
  // Decode only the first codepoint of the (possibly multi-codepoint) unichar.
  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
}
| 296 | |
// A useful helper class for finding the first j >= i so that word[j]
// does not have given character type.
// Borrows (does not own) a UNICHARSET + WERD_CHOICE pair and exposes
// span-skipping queries over the word's unichars.
class UnicodeSpanSkipper {
public:
  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
      : u_(unicharset), word_(word), wordlen_(word->length()) {
  }

  // Given an input position, return the first position >= pos not punc.
  unsigned SkipPunc(unsigned pos);
  // Given an input position, return the first position >= pos not digit.
  unsigned SkipDigits(unsigned pos);
  // Given an input position, return the first position >= pos not roman.
  unsigned SkipRomans(unsigned pos);
  // Given an input position, return the first position >= pos not alpha.
  unsigned SkipAlpha(unsigned pos);

private:
  const UNICHARSET *u_;     // classification tables (not owned)
  const WERD_CHOICE *word_; // word being scanned (not owned)
  unsigned wordlen_;        // cached word_->length()
};
| 319 | |
| 320 unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) { | |
| 321 while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) { | |
| 322 pos++; | |
| 323 } | |
| 324 return pos; | |
| 325 } | |
| 326 | |
| 327 unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) { | |
| 328 while (pos < wordlen_ && | |
| 329 (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) { | |
| 330 pos++; | |
| 331 } | |
| 332 return pos; | |
| 333 } | |
| 334 | |
| 335 unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) { | |
| 336 const char *kRomans = "ivxlmdIVXLMD"; | |
| 337 while (pos < wordlen_) { | |
| 338 int ch = UnicodeFor(u_, word_, pos); | |
| 339 if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) { | |
| 340 break; | |
| 341 } | |
| 342 pos++; | |
| 343 } | |
| 344 return pos; | |
| 345 } | |
| 346 | |
| 347 unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) { | |
| 348 while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) { | |
| 349 pos++; | |
| 350 } | |
| 351 return pos; | |
| 352 } | |
| 353 | |
| 354 static bool LikelyListMarkUnicode(int ch) { | |
| 355 if (ch < 0x80) { | |
| 356 std::string single_ch; | |
| 357 single_ch += ch; | |
| 358 return LikelyListMark(single_ch); | |
| 359 } | |
| 360 switch (ch) { | |
| 361 // TODO(eger) expand this list of unicodes as needed. | |
| 362 case 0x00B0: // degree sign | |
| 363 case 0x2022: // bullet | |
| 364 case 0x25E6: // white bullet | |
| 365 case 0x00B7: // middle dot | |
| 366 case 0x25A1: // white square | |
| 367 case 0x25A0: // black square | |
| 368 case 0x25AA: // black small square | |
| 369 case 0x2B1D: // black very small square | |
| 370 case 0x25BA: // black right-pointing pointer | |
| 371 case 0x25CF: // black circle | |
| 372 case 0x25CB: // white circle | |
| 373 return true; | |
| 374 default: | |
| 375 break; // fall through | |
| 376 } | |
| 377 return false; | |
| 378 } | |
| 379 | |
// Return whether it is very likely that this is a numeral marker that could
// start a list item. Some examples include:
// A I iii. VI (2) 3.5. [C-4]
// Unicode-aware counterpart of LikelyListNumeral: consumes up to three
// numeral segments (roman / digit / single letter), each preceded by at
// most one punctuation char and followed by punctuation; the entire word
// must be consumed to qualify.
static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
  // A lone bullet-like character qualifies immediately.
  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) {
    return true;
  }

  UnicodeSpanSkipper m(u, werd);
  int num_segments = 0;
  unsigned pos = 0;
  while (pos < werd->length() && num_segments < 3) {
    auto numeral_start = m.SkipPunc(pos);
    if (numeral_start > pos + 1) {
      // More than one leading punctuation char: not a list marker.
      break;
    }
    auto numeral_end = m.SkipRomans(numeral_start);
    if (numeral_end == numeral_start) {
      numeral_end = m.SkipDigits(numeral_start);
      if (numeral_end == numeral_start) {
        // If there's a single latin letter, we can use that.
        numeral_end = m.SkipAlpha(numeral_start);
        if (numeral_end - numeral_start != 1) {
          break;
        }
      }
    }
    // We got some sort of numeral.
    num_segments++;
    // Skip any trailing punctuation.
    pos = m.SkipPunc(numeral_end);
    if (pos == numeral_end) {
      // No trailing separator: only qualifies if we are at the word's end.
      break;
    }
  }
  return pos == werd->length();
}
| 417 | |
// Append data to vector only if an equal element is not already present,
// keeping the vector a set under operator==.
template <class T>
void push_back_new(std::vector<T> &vector, const T &data) {
  for (const T &elem : vector) {
    if (elem == data) {
      return; // already present; keep the vector duplicate-free
    }
  }
  vector.push_back(data);
}
| 424 | |
| 425 // ========= Brain Dead Language Model (combined entry points) ================ | |
| 426 | |
| 427 // Given the leftmost word of a line either as a Tesseract unicharset + werd | |
| 428 // or a utf8 string, set the following attributes for it: | |
| 429 // is_list - this word might be a list number or bullet. | |
| 430 // starts_idea - this word is likely to start a sentence. | |
| 431 // ends_idea - this word is likely to end a sentence. | |
| 432 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, | |
| 433 bool *is_list, bool *starts_idea, bool *ends_idea) { | |
| 434 *is_list = false; | |
| 435 *starts_idea = false; | |
| 436 *ends_idea = false; | |
| 437 if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty | |
| 438 *ends_idea = true; | |
| 439 return; | |
| 440 } | |
| 441 | |
| 442 if (unicharset && werd) { // We have a proper werd and unicharset so use it. | |
| 443 if (UniLikelyListItem(unicharset, werd)) { | |
| 444 *is_list = true; | |
| 445 *starts_idea = true; | |
| 446 *ends_idea = true; | |
| 447 } | |
| 448 if (unicharset->get_isupper(werd->unichar_id(0))) { | |
| 449 *starts_idea = true; | |
| 450 } | |
| 451 if (unicharset->get_ispunctuation(werd->unichar_id(0))) { | |
| 452 *starts_idea = true; | |
| 453 *ends_idea = true; | |
| 454 } | |
| 455 } else { // Assume utf8 is mostly ASCII | |
| 456 if (AsciiLikelyListItem(utf8)) { | |
| 457 *is_list = true; | |
| 458 *starts_idea = true; | |
| 459 } | |
| 460 int start_letter = utf8[0]; | |
| 461 if (IsOpeningPunct(start_letter)) { | |
| 462 *starts_idea = true; | |
| 463 } | |
| 464 if (IsTerminalPunct(start_letter)) { | |
| 465 *ends_idea = true; | |
| 466 } | |
| 467 if (start_letter >= 'A' && start_letter <= 'Z') { | |
| 468 *starts_idea = true; | |
| 469 } | |
| 470 } | |
| 471 } | |
| 472 | |
| 473 // Given the rightmost word of a line either as a Tesseract unicharset + werd | |
| 474 // or a utf8 string, set the following attributes for it: | |
| 475 // is_list - this word might be a list number or bullet. | |
| 476 // starts_idea - this word is likely to start a sentence. | |
| 477 // ends_idea - this word is likely to end a sentence. | |
| 478 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, | |
| 479 bool *is_list, bool *starts_idea, bool *ends_idea) { | |
| 480 *is_list = false; | |
| 481 *starts_idea = false; | |
| 482 *ends_idea = false; | |
| 483 if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty | |
| 484 *ends_idea = true; | |
| 485 return; | |
| 486 } | |
| 487 | |
| 488 if (unicharset && werd) { // We have a proper werd and unicharset so use it. | |
| 489 if (UniLikelyListItem(unicharset, werd)) { | |
| 490 *is_list = true; | |
| 491 *starts_idea = true; | |
| 492 } | |
| 493 UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1); | |
| 494 if (unicharset->get_ispunctuation(last_letter)) { | |
| 495 *ends_idea = true; | |
| 496 } | |
| 497 } else { // Assume utf8 is mostly ASCII | |
| 498 if (AsciiLikelyListItem(utf8)) { | |
| 499 *is_list = true; | |
| 500 *starts_idea = true; | |
| 501 } | |
| 502 int last_letter = utf8[utf8.size() - 1]; | |
| 503 if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) { | |
| 504 *ends_idea = true; | |
| 505 } | |
| 506 } | |
| 507 } | |
| 508 | |
| 509 // =============== Implementation of RowScratchRegisters ===================== | |
| 510 /* static */ | |
| 511 void RowScratchRegisters::AppendDebugHeaderFields(std::vector<std::string> &header) { | |
| 512 header.emplace_back("[lmarg,lind;rind,rmarg]"); | |
| 513 header.emplace_back("model"); | |
| 514 } | |
| 515 | |
// Append two debug fields for this row: a snapshot of the geometry
// registers "[lmargin,lindent;rindent,rmargin]" and a compact encoding of
// the hypotheses as "<line type char>:<model list>".
void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
                                          std::vector<std::string> &dbg) const {
  char s[60];
  // The largest (positive and negative) numbers are reported for lindent & rindent.
  // While the column header has widths 5,4,4,5, it is therefore opportune to slightly
  // offset the widths in the format string here to allow ample space for lindent & rindent
  // while keeping the final table output nicely readable: 4,5,5,4.
  snprintf(s, sizeof(s), "[%4d,%5d;%5d,%4d]", lmargin_, lindent_, rindent_, rmargin_);
  dbg.emplace_back(s);
  std::string model_string;
  model_string += static_cast<char>(GetLineType());
  model_string += ":";

  // Strong models print as 1-based indices into theory's model list; the
  // crown sentinels print as "CrL"/"CrR"; model-less hypotheses are skipped.
  int model_numbers = 0;
  for (const auto &hypothese : hypotheses_) {
    if (hypothese.model == nullptr) {
      continue;
    }
    if (model_numbers > 0) {
      model_string += ",";
    }
    if (StrongModel(hypothese.model)) {
      model_string += std::to_string(1 + theory.IndexOf(hypothese.model));
    } else if (hypothese.model == kCrownLeft) {
      model_string += "CrL";
    } else if (hypothese.model == kCrownRight) {
      model_string += "CrR";
    }
    model_numbers++;
  }
  if (model_numbers == 0) {
    // No model-backed hypotheses at all.
    model_string += "0";
  }

  dbg.push_back(model_string);
}
| 552 | |
// Attach this scratch register to row and reset the geometry registers:
// margins start at zero, and the indents start as the raw pixel distances
// from the row to its block's left/right edges.
void RowScratchRegisters::Init(const RowInfo &row) {
  ri_ = &row;
  lmargin_ = 0;
  lindent_ = row.pix_ldistance;
  rmargin_ = 0;
  rindent_ = row.pix_rdistance;
}
| 560 | |
| 561 LineType RowScratchRegisters::GetLineType() const { | |
| 562 if (hypotheses_.empty()) { | |
| 563 return LT_UNKNOWN; | |
| 564 } | |
| 565 bool has_start = false; | |
| 566 bool has_body = false; | |
| 567 for (const auto &hypothese : hypotheses_) { | |
| 568 switch (hypothese.ty) { | |
| 569 case LT_START: | |
| 570 has_start = true; | |
| 571 break; | |
| 572 case LT_BODY: | |
| 573 has_body = true; | |
| 574 break; | |
| 575 default: | |
| 576 tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty); | |
| 577 break; | |
| 578 } | |
| 579 } | |
| 580 if (has_start && has_body) { | |
| 581 return LT_MULTIPLE; | |
| 582 } | |
| 583 return has_start ? LT_START : LT_BODY; | |
| 584 } | |
| 585 | |
| 586 LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const { | |
| 587 if (hypotheses_.empty()) { | |
| 588 return LT_UNKNOWN; | |
| 589 } | |
| 590 bool has_start = false; | |
| 591 bool has_body = false; | |
| 592 for (const auto &hypothese : hypotheses_) { | |
| 593 if (hypothese.model != model) { | |
| 594 continue; | |
| 595 } | |
| 596 switch (hypothese.ty) { | |
| 597 case LT_START: | |
| 598 has_start = true; | |
| 599 break; | |
| 600 case LT_BODY: | |
| 601 has_body = true; | |
| 602 break; | |
| 603 default: | |
| 604 tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty); | |
| 605 break; | |
| 606 } | |
| 607 } | |
| 608 if (has_start && has_body) { | |
| 609 return LT_MULTIPLE; | |
| 610 } | |
| 611 return has_start ? LT_START : LT_BODY; | |
| 612 } | |
| 613 | |
| 614 void RowScratchRegisters::SetStartLine() { | |
| 615 LineType current_lt = GetLineType(); | |
| 616 if (current_lt != LT_UNKNOWN && current_lt != LT_START) { | |
| 617 tprintf("Trying to set a line to be START when it's already BODY.\n"); | |
| 618 } | |
| 619 if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) { | |
| 620 push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr)); | |
| 621 } | |
| 622 } | |
| 623 | |
| 624 void RowScratchRegisters::SetBodyLine() { | |
| 625 LineType current_lt = GetLineType(); | |
| 626 if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) { | |
| 627 tprintf("Trying to set a line to be BODY when it's already START.\n"); | |
| 628 } | |
| 629 if (current_lt == LT_UNKNOWN || current_lt == LT_START) { | |
| 630 push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr)); | |
| 631 } | |
| 632 } | |
| 633 | |
| 634 void RowScratchRegisters::AddStartLine(const ParagraphModel *model) { | |
| 635 push_back_new(hypotheses_, LineHypothesis(LT_START, model)); | |
| 636 auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr)); | |
| 637 if (found != hypotheses_.end()) { | |
| 638 hypotheses_.erase(found); | |
| 639 } | |
| 640 } | |
| 641 | |
| 642 void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) { | |
| 643 push_back_new(hypotheses_, LineHypothesis(LT_BODY, model)); | |
| 644 auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr)); | |
| 645 if (found != hypotheses_.end()) { | |
| 646 hypotheses_.erase(found); | |
| 647 } | |
| 648 } | |
| 649 | |
| 650 void RowScratchRegisters::StartHypotheses(SetOfModels *models) const { | |
| 651 for (const auto &hypothese : hypotheses_) { | |
| 652 if (hypothese.ty == LT_START && StrongModel(hypothese.model)) { | |
| 653 push_back_new(*models, hypothese.model); | |
| 654 } | |
| 655 } | |
| 656 } | |
| 657 | |
| 658 void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const { | |
| 659 for (const auto &hypothese : hypotheses_) { | |
| 660 if (StrongModel(hypothese.model)) { | |
| 661 push_back_new(*models, hypothese.model); | |
| 662 } | |
| 663 } | |
| 664 } | |
| 665 | |
| 666 void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const { | |
| 667 for (const auto &hypothese : hypotheses_) { | |
| 668 if (hypothese.model != nullptr) { | |
| 669 push_back_new(*models, hypothese.model); | |
| 670 } | |
| 671 } | |
| 672 } | |
| 673 | |
| 674 const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const { | |
| 675 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) { | |
| 676 return nullptr; | |
| 677 } | |
| 678 return hypotheses_[0].model; | |
| 679 } | |
| 680 | |
| 681 const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const { | |
| 682 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) { | |
| 683 return nullptr; | |
| 684 } | |
| 685 return hypotheses_[0].model; | |
| 686 } | |
| 687 | |
// Discard any hypotheses whose model is not in the given list.
// An empty models list is treated as "keep everything".
void RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models) {
  if (models.empty()) {
    return;
  }
  // Iterate backwards so erasing never disturbs indices not yet visited.
  for (int h = hypotheses_.size() - 1; h >= 0; h--) {
    if (!contains(models, hypotheses_[h].model)) {
      hypotheses_.erase(hypotheses_.begin() + h);
    }
  }
}
| 699 | |
// ============ Geometry based Paragraph Detection Algorithm =================

// A one-dimensional cluster of integer values (e.g. indent positions),
// summarized by its center and the number of raw values that fell into it.
struct Cluster {
  Cluster() : center(0), count(0) {}
  Cluster(int cen, int num) : center(cen), count(num) {}

  int center; // The center of the cluster.
  int count;  // The number of entries within the cluster.
};
| 709 | |
// Greedy one-dimensional clusterer: accumulate values with Add(), then
// GetClusters() sorts them and groups runs no wider than max_cluster_width.
class SimpleClusterer {
public:
  explicit SimpleClusterer(int max_cluster_width) : max_cluster_width_(max_cluster_width) {}
  // Record one raw value.
  void Add(int value) {
    values_.push_back(value);
  }
  // Number of values recorded so far.
  size_t size() const {
    return values_.size();
  }
  // Compute the clusters over all values added so far.
  void GetClusters(std::vector<Cluster> *clusters);

private:
  int max_cluster_width_;   // maximum span (in value units) of one cluster
  std::vector<int> values_; // raw values; sorted in place by GetClusters
};
| 725 | |
| 726 // Return the index of the cluster closest to value. | |
| 727 static int ClosestCluster(const std::vector<Cluster> &clusters, int value) { | |
| 728 unsigned best_index = 0; | |
| 729 for (unsigned i = 0; i < clusters.size(); i++) { | |
| 730 if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) { | |
| 731 best_index = i; | |
| 732 } | |
| 733 } | |
| 734 return best_index; | |
| 735 } | |
| 736 | |
// Cluster the accumulated values greedily: after sorting, each cluster is
// the maximal run of values within max_cluster_width_ of the run's LOWEST
// value, its center being the midpoint of the run's extremes and its count
// the run length.
void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) {
  clusters->clear();
  std::sort(values_.begin(), values_.end());
  for (unsigned i = 0; i < values_.size();) {
    int orig_i = i;
    int lo = values_[i];
    int hi = lo;
    // Membership is measured from lo (the run's first value), not from the
    // most recent value, so clusters cannot chain indefinitely.
    while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
      hi = values_[i];
    }
    clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
  }
}
| 750 | |
| 751 // Calculate left- and right-indent tab stop values seen in | |
| 752 // rows[row_start, row_end) given a tolerance of tolerance. | |
// Cluster the left and right indents of rows[row_start, row_end) into the
// block's common "tab stops", returned via *left_tabs and *right_tabs.
// tolerance is the maximum spread (in the same units as the indents) that
// still counts as a single tab stop.
static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
                              int tolerance, std::vector<Cluster> *left_tabs,
                              std::vector<Cluster> *right_tabs) {
  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) {
    return;
  }
  // First pass: toss all left and right indents into clusterers.
  SimpleClusterer initial_lefts(tolerance);
  SimpleClusterer initial_rights(tolerance);
  std::vector<Cluster> initial_left_tabs;
  std::vector<Cluster> initial_right_tabs;
  for (int i = row_start; i < row_end; i++) {
    initial_lefts.Add((*rows)[i].lindent_);
    initial_rights.Add((*rows)[i].rindent_);
  }
  initial_lefts.GetClusters(&initial_left_tabs);
  initial_rights.GetClusters(&initial_right_tabs);

  // Second pass: cluster only lines that are not "stray"
  // An example of a stray line is a page number -- a line whose start
  // and end tab-stops are far outside the typical start and end tab-stops
  // for the block.
  // Put another way, we only cluster data from lines whose start or end
  // tab stop is frequent.
  SimpleClusterer lefts(tolerance);
  SimpleClusterer rights(tolerance);

  // Outlier elimination. We might want to switch this to test outlier-ness
  // based on how strange a position an outlier is in instead of or in addition
  // to how rare it is. These outliers get re-added if we end up having too
  // few tab stops to work with, however.
  // The rarity cutoff scales with how many rows we get to look at.
  int infrequent_enough_to_ignore = 0;
  if (row_end - row_start >= 8) {
    infrequent_enough_to_ignore = 1;
  }
  if (row_end - row_start >= 20) {
    infrequent_enough_to_ignore = 2;
  }

  // Keep a row only if its left or right indent lands in a sufficiently
  // popular cluster from the first pass.
  for (int i = row_start; i < row_end; i++) {
    int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
    int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
    if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
        initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
      lefts.Add((*rows)[i].lindent_);
      rights.Add((*rows)[i].rindent_);
    }
  }
  lefts.GetClusters(left_tabs);
  rights.GetClusters(right_tabs);

  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
      (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
    // One side is really ragged, and the other only has one tab stop,
    // so those "insignificant outliers" are probably important, actually.
    // This often happens on a page of an index. Add back in the ones
    // we omitted in the first pass.
    for (int i = row_start; i < row_end; i++) {
      int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
      int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
      if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
            initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
        lefts.Add((*rows)[i].lindent_);
        rights.Add((*rows)[i].rindent_);
      }
    }
  }
  // Re-extract the clusters after the (possible) re-add above.
  // NOTE(review): this assumes GetClusters() rebuilds the output vector
  // from scratch rather than appending -- confirm in SimpleClusterer.
  lefts.GetClusters(left_tabs);
  rights.GetClusters(right_tabs);

  // If one side is almost a two-indent aligned side, and the other clearly
  // isn't, try to prune out the least frequent tab stop from that side.
  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
    // Find the rarest of the three left tab stops.
    int to_prune = -1;
    for (int i = left_tabs->size() - 1; i >= 0; i--) {
      if (to_prune < 0 || (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
        to_prune = i;
      }
    }
    if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
      left_tabs->erase(left_tabs->begin() + to_prune);
    }
  }
  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
    // Same pruning, mirrored for the right side.
    int to_prune = -1;
    for (int i = right_tabs->size() - 1; i >= 0; i--) {
      if (to_prune < 0 || (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
        to_prune = i;
      }
    }
    if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
      right_tabs->erase(right_tabs->begin() + to_prune);
    }
  }
}
| 848 | |
// Given a paragraph model mark rows[row_start, row_end) as said model
// start or body lines.
//
// Case 1: model->first_indent_ != model->body_indent_
// Differentiating the paragraph start lines from the paragraph body lines in
// this case is easy, we just see how far each line is indented.
//
// Case 2: model->first_indent_ == model->body_indent_
// Here, we find end-of-paragraph lines by looking for "short lines."
// What constitutes a "short line" changes depending on whether the text is
// ragged-right[left] or fully justified (aligned left and right).
//
// Case 2a: Ragged Right (or Left) text. (eop_threshold == 0)
// We have a new paragraph if the first word of this line would have fit
// at the end of the previous line.
//
// Case 2b: Fully Justified. (eop_threshold > 0)
// We mark a line as short (end of paragraph) if the offside indent
// is greater than eop_threshold.
static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
                              const ParagraphModel *model, bool ltr, int eop_threshold) {
  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
    return;
  }
  for (int row = row_start; row < row_end; row++) {
    bool valid_first = ValidFirstLine(rows, row, model);
    bool valid_body = ValidBodyLine(rows, row, model);
    if (valid_first && !valid_body) {
      // Only the first-line indent matches: unambiguous paragraph start.
      (*rows)[row].AddStartLine(model);
    } else if (valid_body && !valid_first) {
      // Only the body indent matches: unambiguous body line.
      (*rows)[row].AddBodyLine(model);
    } else if (valid_body && valid_first) {
      // Ambiguous (Case 2 above): decide by whether the previous line
      // looked like an end-of-paragraph line.
      bool after_eop = (row == row_start);
      if (row > row_start) {
        if (eop_threshold > 0) {
          // Fully justified (Case 2b): a short previous line means its
          // offside indent exceeds the threshold.
          if (model->justification() == JUSTIFICATION_LEFT) {
            after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
          } else {
            after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
          }
        } else {
          // Ragged (Case 2a): would this row's first word have fit on the
          // previous line?
          after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], model->justification());
        }
      }
      if (after_eop) {
        (*rows)[row].AddStartLine(model);
      } else {
        (*rows)[row].AddBodyLine(model);
      }
    } else {
      // Do nothing. Stray row.
    }
  }
}
| 903 | |
// GeometricClassifierState holds all of the information we'll use while
// trying to determine a paragraph model for the text lines in a block of
// text:
//   + the rows under consideration [row_start, row_end)
//   + the common left- and right-indent tab stops
//   + does the block start out left-to-right or right-to-left
// Further, this struct holds the data we amass for the (single) ParagraphModel
// we'll assign to the text lines (assuming we get that far).
struct GeometricClassifierState {
  // Computes the interword-space tolerance and the tab stop clusters for
  // rows[r_start, r_end) up front; the justification and indents are filled
  // in later by the classifier.
  GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
                           int r_end)
      : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
    tolerance = InterwordSpace(*r, r_start, r_end);
    CalculateTabStops(r, r_start, r_end, tolerance, &left_tabs, &right_tabs);
    if (debug_level >= 3) {
      tesserr << "Geometry: TabStop cluster tolerance = " << tolerance << "; "
              << left_tabs.size() << " left tabs; "
              << right_tabs.size() << " right tabs\n";
    }
    ltr = (*r)[r_start].ri_->ltr;
  }

  // Commit to left justification; the margin comes from the first row.
  void AssumeLeftJustification() {
    just = tesseract::JUSTIFICATION_LEFT;
    margin = (*rows)[row_start].lmargin_;
  }

  // Commit to right justification; the margin comes from the first row.
  void AssumeRightJustification() {
    just = tesseract::JUSTIFICATION_RIGHT;
    margin = (*rows)[row_start].rmargin_;
  }

  // Align tabs are the tab stops the text is aligned to.
  const std::vector<Cluster> &AlignTabs() const {
    if (just == tesseract::JUSTIFICATION_RIGHT) {
      return right_tabs;
    }
    return left_tabs;
  }

  // Offside tabs are the tab stops opposite the tabs used to align the text.
  //
  // Note that for a left-to-right text which is aligned to the right such as
  // this function comment, the offside tabs are the horizontal tab stops
  // marking the beginning of ("Note", "this" and "marking").
  const std::vector<Cluster> &OffsideTabs() const {
    if (just == tesseract::JUSTIFICATION_RIGHT) {
      return left_tabs;
    }
    return right_tabs;
  }

  // Return whether the i'th row extends from the leftmost left tab stop
  // to the right most right tab stop.
  bool IsFullRow(int i) const {
    return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
           ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
  }

  // Index of the align-side tab stop cluster closest to row_idx's indent.
  int AlignsideTabIndex(int row_idx) const {
    return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
  }

  // Given what we know about the paragraph justification (just), would the
  // first word of row_b have fit at the end of row_a?
  bool FirstWordWouldHaveFit(int row_a, int row_b) {
    return ::tesseract::FirstWordWouldHaveFit((*rows)[row_a], (*rows)[row_b], just);
  }

  // Debug dump of the rows under consideration.
  void PrintRows() const {
    PrintRowRange(*rows, row_start, row_end);
  }

  // Report a classification failure (reason plus row dump) when the debug
  // level is at least min_debug_level.
  void Fail(int min_debug_level, const char *why) const {
    if (debug_level < min_debug_level) {
      return;
    }
    tprintf("# %s\n", why);
    PrintRows();
  }

  // Build the ParagraphModel from the parameters amassed so far.
  ParagraphModel Model() const {
    return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
  }

  // We print out messages with a debug level at least as great as debug_level.
  int debug_level = 0;

  // The Geometric Classifier was asked to find a single paragraph model
  // to fit the text rows (*rows)[row_start, row_end)
  std::vector<RowScratchRegisters> *rows;
  int row_start = 0;
  int row_end = 0;

  // The amount by which we expect the text edge can vary and still be aligned.
  int tolerance = 0;

  // Is the script in this text block left-to-right?
  // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
  bool ltr = false;

  // These left and right tab stops were determined to be the common tab
  // stops for the given text.
  std::vector<Cluster> left_tabs;
  std::vector<Cluster> right_tabs;

  // These are parameters we must determine to create a ParagraphModel.
  tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
  int margin = 0;
  int first_indent = 0;
  int body_indent = 0;

  // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
  int eop_threshold = 0;
};
| 1019 | |
// Given a section of text where strong textual clues did not help identifying
// paragraph breaks, and for which the left and right indents have exactly
// three tab stops between them, attempt to find the paragraph breaks based
// solely on the outline of the text and whether the script is left-to-right.
//
// Algorithm Detail:
//   The selected rows are in the form of a rectangle except
//   for some number of "short lines" of the same length:
//
//   (A1)  xxxxxxxxxxxxx  (B1) xxxxxxxxxxxx
//         xxxxxxxxxxx         xxxxxxxxxx     # A "short" line.
//         xxxxxxxxxxxxx       xxxxxxxxxxxx
//         xxxxxxxxxxxxx       xxxxxxxxxxxx
//
//   We have a slightly different situation if the only short
//   line is at the end of the excerpt.
//
//   (A2)  xxxxxxxxxxxxx  (B2) xxxxxxxxxxxx
//         xxxxxxxxxxxxx       xxxxxxxxxxxx
//         xxxxxxxxxxxxx       xxxxxxxxxxxx
//         xxxxxxxxxxx         xxxxxxxxxx     # A "short" line.
//
//   We'll interpret these as follows based on the reasoning in the comment for
//   GeometricClassify():
//       [script direction: first indent, body indent]
//   (A1) LtR: 2,0  RtL: 0,0  (B1) LtR: 0,0  RtL: 2,0
//   (A2) LtR: 2,0  RtL: CrR  (B2) LtR: CrL  RtL: 2,0
static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s,
                                                   ParagraphTheory *theory) {
  // Count how many rows span the full width, and whether the last row does.
  int num_rows = s.row_end - s.row_start;
  int num_full_rows = 0;
  int last_row_full = 0;
  for (int i = s.row_start; i < s.row_end; i++) {
    if (s.IsFullRow(i)) {
      num_full_rows++;
      if (i == s.row_end - 1) {
        last_row_full++;
      }
    }
  }

  if (num_full_rows < 0.7 * num_rows) {
    s.Fail(1, "Not enough full lines to know which lines start paras.");
    return;
  }

  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
  s.eop_threshold = 0;

  // With no other evidence, guess the alignment from the script direction.
  if (s.ltr) {
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (debug_level > 0) {
    tprintf(
        "# Not enough variety for clear outline classification. "
        "Guessing these are %s aligned based on script.\n",
        s.ltr ? "left" : "right");
    s.PrintRows();
  }

  if (s.AlignTabs().size() == 2) { // case A1 or A2
    // Two align-side tab stops: the inner one is the first-line indent.
    s.first_indent = s.AlignTabs()[1].center;
    s.body_indent = s.AlignTabs()[0].center;
  } else { // case B1 or B2
    if (num_rows - 1 == num_full_rows - last_row_full) {
      // case B2: the only short row is the last one, so this looks like a
      // single flush paragraph -- mark it with a weak "crown" model.
      const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
      (*s.rows)[s.row_start].AddStartLine(model);
      for (int i = s.row_start + 1; i < s.row_end; i++) {
        (*s.rows)[i].AddBodyLine(model);
      }
      return;
    } else {
      // case B1: flush-aligned, fully justified; short lines are detected
      // via the offside threshold halfway between the two offside tabs.
      s.first_indent = s.body_indent = s.AlignTabs()[0].center;
      s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
    }
  }
  const ParagraphModel *model = theory->AddModel(s.Model());
  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, s.ltr, s.eop_threshold);
  return;
}
| 1105 | |
// This function is called if strong textual clues were not available, but
// the caller hopes that the paragraph breaks will be super obvious just
// by the outline of the text.
//
// The particularly difficult case is figuring out what's going on if you
// don't have enough short paragraph end lines to tell us what's going on.
//
// For instance, let's say you have the following outline:
//
//   (A1)  xxxxxxxxxxxxxxxxxxxxxx
//           xxxxxxxxxxxxxxxxxxxx
//         xxxxxxxxxxxxxxxxxxxxxx
//         xxxxxxxxxxxxxxxxxxxxxx
//
// Even if we know that the text is left-to-right and so will probably be
// left-aligned, both of the following are possible texts:
//
//  (A1a)  1. Here our list item
//           with two full lines.
//         2. Here a second item.
//         3. Here our third one.
//
//  (A1b)  so ends paragraph one.
//           Here starts another
//         paragraph we want to
//         read.  This continues
//
// These examples are obvious from the text and should have been caught
// by the StrongEvidenceClassify pass. However, for languages where we don't
// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
// it's worth guessing that (A1b) is the correct interpretation if there are
// far more "full" lines than "short" lines.
static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
                              int row_start, int row_end, ParagraphTheory *theory) {
  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) {
    return;
  }
  if (debug_level > 1) {
    tprintf("###############################################\n");
    tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n", row_start, row_end);
    tprintf("###############################################\n");
  }
  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);

  GeometricClassifierState s(debug_level, rows, row_start, row_end);
  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
    s.Fail(2, "Too much variety for simple outline classification.");
    return;
  }
  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
    s.Fail(1, "Not enough variety for simple outline classification.");
    return;
  }
  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
    // Exactly three tab stops total: handled by the special-case routine.
    GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
    return;
  }

  // At this point, we know that one side has at least two tab stops, and the
  // other side has one or two tab stops.
  // Left to determine:
  //   (1) Which is the body indent and which is the first line indent?
  //   (2) Is the text fully justified?

  // If one side happens to have three or more tab stops, assume that side
  // is opposite of the aligned side.
  if (s.right_tabs.size() > 2) {
    s.AssumeLeftJustification();
  } else if (s.left_tabs.size() > 2) {
    s.AssumeRightJustification();
  } else if (s.ltr) { // guess based on script direction
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (s.AlignTabs().size() == 2) {
    // For each tab stop on the aligned side, how many of them appear
    // to be paragraph start lines?  [first lines]
    int firsts[2] = {0, 0};
    // Count the first line as a likely paragraph start line.
    firsts[s.AlignsideTabIndex(s.row_start)]++;
    // For each line, if the first word would have fit on the previous
    // line count it as a likely paragraph start line.
    bool jam_packed = true;
    for (int i = s.row_start + 1; i < s.row_end; i++) {
      if (s.FirstWordWouldHaveFit(i - 1, i)) {
        firsts[s.AlignsideTabIndex(i)]++;
        jam_packed = false;
      }
    }
    // Make an extra accounting for the last line of the paragraph just
    // in case it's the only short line in the block. That is, take its
    // first word as typical and see if this looks like the *last* line
    // of a paragraph. If so, mark the *other* indent as probably a first.
    // NOTE(review): the row is deliberately compared against itself here,
    // using its own first word as a stand-in for a typical word.
    if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
      firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
    }

    // What fraction of the rows at each tab stop look like starts?
    int percent0firsts, percent1firsts;
    percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
    percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;

    // TODO(eger): Tune these constants if necessary.
    if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) {
      // Tab stop 1 is clearly the paragraph-start indent.
      s.first_indent = s.AlignTabs()[1].center;
      s.body_indent = s.AlignTabs()[0].center;
    } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
               percent1firsts + 30 < percent0firsts) {
      // Tab stop 0 is clearly the paragraph-start indent.
      s.first_indent = s.AlignTabs()[0].center;
      s.body_indent = s.AlignTabs()[1].center;
    } else {
      // Ambiguous! Probably lineated (poetry)
      if (debug_level > 1) {
        tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
                s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[0].center, percent0firsts);
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[1].center, percent1firsts);
        s.PrintRows();
      }
      return;
    }
  } else {
    // There's only one tab stop for the "aligned to" side.
    s.first_indent = s.body_indent = s.AlignTabs()[0].center;
  }

  // At this point, we have our model.
  const ParagraphModel *model = theory->AddModel(s.Model());

  // Now all we have to do is figure out if the text is fully justified or not.
  // eop_threshold: default to fully justified unless we see evidence below.
  //    See description on MarkRowsWithModel()
  s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
  // If the text is not fully justified, re-set the eop_threshold to 0.
  if (s.AlignTabs().size() == 2) {
    // Paragraphs with a paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      if (ValidFirstLine(s.rows, i + 1, model) &&
          !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
                       s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  } else {
    // Paragraphs with no paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      if (!s.FirstWordWouldHaveFit(i, i + 1) &&
          !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
                       s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  }
  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
}
| 1268 | |
| 1269 // =============== Implementation of ParagraphTheory ===================== | |
| 1270 | |
| 1271 const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) { | |
| 1272 for (const auto &m : *models_) { | |
| 1273 if (m->Comparable(model)) { | |
| 1274 return m; | |
| 1275 } | |
| 1276 } | |
| 1277 auto *m = new ParagraphModel(model); | |
| 1278 models_->push_back(m); | |
| 1279 push_back_new(models_we_added_, m); | |
| 1280 return m; | |
| 1281 } | |
| 1282 | |
| 1283 void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) { | |
| 1284 size_t w = 0; | |
| 1285 for (size_t r = 0; r < models_->size(); r++) { | |
| 1286 ParagraphModel *m = (*models_)[r]; | |
| 1287 if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) { | |
| 1288 delete m; | |
| 1289 } else { | |
| 1290 if (r > w) { | |
| 1291 (*models_)[w] = m; | |
| 1292 } | |
| 1293 w++; | |
| 1294 } | |
| 1295 } | |
| 1296 models_->resize(w); | |
| 1297 } | |
| 1298 | |
| 1299 // Examine rows[start, end) and try to determine if an existing non-centered | |
| 1300 // paragraph model would fit them perfectly. If so, return a pointer to it. | |
| 1301 // If not, return nullptr. | |
| 1302 const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows, | |
| 1303 int start, int end) const { | |
| 1304 for (const auto *model : *models_) { | |
| 1305 if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) { | |
| 1306 return model; | |
| 1307 } | |
| 1308 } | |
| 1309 return nullptr; | |
| 1310 } | |
| 1311 | |
| 1312 void ParagraphTheory::NonCenteredModels(SetOfModels *models) { | |
| 1313 for (const auto *model : *models_) { | |
| 1314 if (model->justification() != JUSTIFICATION_CENTER) { | |
| 1315 push_back_new(*models, model); | |
| 1316 } | |
| 1317 } | |
| 1318 } | |
| 1319 | |
| 1320 int ParagraphTheory::IndexOf(const ParagraphModel *model) const { | |
| 1321 int i = 0; | |
| 1322 for (const auto *m : *models_) { | |
| 1323 if (m == model) { | |
| 1324 return i; | |
| 1325 } | |
| 1326 i++; | |
| 1327 } | |
| 1328 return -1; | |
| 1329 } | |
| 1330 | |
| 1331 bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row, | |
| 1332 const ParagraphModel *model) { | |
| 1333 if (!StrongModel(model)) { | |
| 1334 tprintf("ValidFirstLine() should only be called with strong models!\n"); | |
| 1335 } | |
| 1336 return StrongModel(model) && model->ValidFirstLine((*rows)[row].lmargin_, (*rows)[row].lindent_, | |
| 1337 (*rows)[row].rindent_, (*rows)[row].rmargin_); | |
| 1338 } | |
| 1339 | |
| 1340 bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row, | |
| 1341 const ParagraphModel *model) { | |
| 1342 if (!StrongModel(model)) { | |
| 1343 tprintf("ValidBodyLine() should only be called with strong models!\n"); | |
| 1344 } | |
| 1345 return StrongModel(model) && model->ValidBodyLine((*rows)[row].lmargin_, (*rows)[row].lindent_, | |
| 1346 (*rows)[row].rindent_, (*rows)[row].rmargin_); | |
| 1347 } | |
| 1348 | |
| 1349 bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b, | |
| 1350 const ParagraphModel *model) { | |
| 1351 if (model != kCrownRight && model != kCrownLeft) { | |
| 1352 tprintf("CrownCompatible() should only be called with crown models!\n"); | |
| 1353 return false; | |
| 1354 } | |
| 1355 auto &row_a = (*rows)[a]; | |
| 1356 auto &row_b = (*rows)[b]; | |
| 1357 if (model == kCrownRight) { | |
| 1358 return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_, | |
| 1359 Epsilon(row_a.ri_->average_interword_space)); | |
| 1360 } | |
| 1361 return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_, | |
| 1362 Epsilon(row_a.ri_->average_interword_space)); | |
| 1363 } | |
| 1364 | |
| 1365 // =============== Implementation of ParagraphModelSmearer ==================== | |
| 1366 | |
// Prepare to smear models over rows[row_start, row_end).  On degenerate
// arguments the range collapses to empty so Smear() becomes a no-op.
ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,
                                             int row_start, int row_end, ParagraphTheory *theory)
    : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
    // Invalid range: make this smearer inert.
    row_start_ = 0;
    row_end_ = 0;
    return;
  }
  // One slot per row boundary plus a sentinel at each end; OpenModels()
  // indexes into this storage.
  open_models_.resize(open_models_.size() + row_end - row_start + 2);
}
| 1377 | |
// see paragraphs_internal.h
// Recompute which models are "open" (could still be continued) at the
// boundary after each row in [row_start, row_end), clamped to the range
// this smearer was constructed with.
void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
  SetOfModels no_models;
  // Clamp the requested range to [row_start_, row_end_).
  if (row_start < row_start_) {
    row_start = row_start_;
  }
  if (row_end > row_end_) {
    row_end = row_end_;
  }

  // Start one row early (when possible) so the state flowing into
  // row_start is seeded from the preceding row.
  for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) {
    if ((*rows_)[row].ri_->num_words == 0) {
      // An empty row closes every open model.
      OpenModels(row + 1) = no_models;
    } else {
      SetOfModels &opened = OpenModels(row);
      // Any model this row could start becomes open from here on.
      (*rows_)[row].StartHypotheses(&opened);

      // Which models survive the transition from row to row + 1?
      SetOfModels still_open;
      for (auto &m : opened) {
        if (ValidFirstLine(rows_, row, m) || ValidBodyLine(rows_, row, m)) {
          // This is basic filtering; we check likely paragraph starty-ness down
          // below in Smear() -- you know, whether the first word would have fit
          // and such.
          push_back_new(still_open, m);
        }
      }
      OpenModels(row + 1) = std::move(still_open);
    }
  }
}
| 1409 | |
// see paragraphs_internal.h
// Propagate ("smear") recently used paragraph models over rows whose line
// type is still uncertain, adding start/body hypotheses where a model fits.
void ParagraphModelSmearer::Smear() {
  CalculateOpenModels(row_start_, row_end_);

  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
  // we have multiple LT_START hypotheses), see if there's a model that
  // was recently used (an "open" model) which might model it well.
  for (int i = row_start_; i < row_end_; i++) {
    RowScratchRegisters &row = (*rows_)[i];
    if (row.ri_->num_words == 0) {
      continue; // Empty rows carry no evidence.
    }

    // Step One:
    //   Figure out if there are "open" models which are left-aligned or
    //   right-aligned.  This is important for determining whether the
    //   "first" word in a row would fit at the "end" of the previous row.
    bool left_align_open = false;
    bool right_align_open = false;
    for (auto &m : OpenModels(i)) {
      switch (m->justification()) {
        case JUSTIFICATION_LEFT:
          left_align_open = true;
          break;
        case JUSTIFICATION_RIGHT:
          right_align_open = true;
          break;
        default:
          // Unknown/center justification counts as both.
          left_align_open = right_align_open = true;
      }
    }
    // Step Two:
    //   Use that knowledge to figure out if this row is likely to
    //   start a paragraph.
    bool likely_start;
    if (i == 0) {
      likely_start = true;
    } else {
      if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) {
        // Both (or neither) sides open: either justification may apply.
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT) ||
                       LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
      } else if (left_align_open) {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT);
      } else {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
      }
    }

    // Step Three:
    //   If this text line seems like an obvious first line of an
    //   open model, or an obvious continuation of an existing
    //   modelled paragraph, mark it up.
    if (likely_start) {
      // Add Start Hypotheses for all Open models that fit.
      for (unsigned m = 0; m < OpenModels(i).size(); m++) {
        if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
          row.AddStartLine(OpenModels(i)[m]);
        }
      }
    } else {
      // Add relevant body line hypotheses.
      SetOfModels last_line_models;
      if (i > 0) {
        (*rows_)[i - 1].StrongHypotheses(&last_line_models);
      } else {
        theory_->NonCenteredModels(&last_line_models);
      }
      for (auto model : last_line_models) {
        if (ValidBodyLine(rows_, i, model)) {
          row.AddBodyLine(model);
        }
      }
    }

    // Step Four:
    //   If we're still quite unsure about this line, go through all
    //   models in our theory and see if this row could be the start
    //   of any of our models.
    if (row.GetLineType() == LT_UNKNOWN ||
        (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
      SetOfModels all_models;
      theory_->NonCenteredModels(&all_models);
      for (auto &all_model : all_models) {
        if (ValidFirstLine(rows_, i, all_model)) {
          row.AddStartLine(all_model);
        }
      }
    }
    // Step Five:
    //   Since we may have updated the hypotheses about this row, we need
    //   to recalculate the Open models for the rest of rows[i + 1, row_end)
    if (row.GetLineType() != LT_UNKNOWN) {
      CalculateOpenModels(i + 1, row_end_);
    }
  }
}
| 1506 | |
| 1507 // ================ Main Paragraph Detection Algorithm ======================= | |
| 1508 | |
| 1509 // Find out what ParagraphModels are actually used, and discard any | |
| 1510 // that are not. | |
| 1511 static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows, | |
| 1512 ParagraphTheory *theory) { | |
| 1513 SetOfModels used_models; | |
| 1514 for (const auto &row : rows) { | |
| 1515 row.StrongHypotheses(&used_models); | |
| 1516 } | |
| 1517 theory->DiscardUnusedModels(used_models); | |
| 1518 } | |
| 1519 | |
| 1520 // DowngradeWeakestToCrowns: | |
| 1521 // Forget any flush-{left, right} models unless we see two or more | |
| 1522 // of them in sequence. | |
| 1523 // | |
| 1524 // In pass 3, we start to classify even flush-left paragraphs (paragraphs | |
| 1525 // where the first line and body indent are the same) as having proper Models. | |
| 1526 // This is generally dangerous, since if you start imagining that flush-left | |
| 1527 // is a typical paragraph model when it is not, it will lead you to chop normal | |
| 1528 // indented paragraphs in the middle whenever a sentence happens to start on a | |
| 1529 // new line (see "This" above). What to do? | |
| 1530 // What we do is to take any paragraph which is flush left and is not | |
| 1531 // preceded by another paragraph of the same model and convert it to a "Crown" | |
| 1532 // paragraph. This is a weak pseudo-ParagraphModel which is a placeholder | |
| 1533 // for later. It means that the paragraph is flush, but it would be desirable | |
// to mark it as the same model as following text if it fits.  The downgrade
// path FlushLeft -> CrownLeft -> Model-of-following-paragraph means that we
// avoid making flush left Paragraph Models whenever we see a top-of-the-page
// half-of-a-paragraph, and instead we mark it the same as normal body text.
| 1538 // | |
| 1539 // Implementation: | |
| 1540 // | |
| 1541 // Comb backwards through the row scratch registers, and turn any | |
| 1542 // sequences of body lines of equivalent type abutted against the beginning | |
| 1543 // or a body or start line of a different type into a crown paragraph. | |
static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
                                     std::vector<RowScratchRegisters> *rows) {
  int start;
  // Walk the rows bottom-up; each outer iteration consumes one run
  // rows[start, end) of lines sharing a unique body-line hypothesis.
  // (debug_level is currently unused in this function.)
  for (int end = rows->size(); end > 0; end = start) {
    // Search back for a body line of a unique type.
    const ParagraphModel *model = nullptr;
    while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
      end--;
    }
    if (end == 0) {
      break;
    }
    start = end - 1;
    while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
      start--; // walk back to the first line that is not the same body type.
    }
    // If the run is headed by a start line of the same strong *flush* model
    // (first indent equal to body indent within tolerance), absorb it too.
    if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && StrongModel(model) &&
        NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) {
      start--;
    }
    start++;
    // Now rows[start, end) is a sequence of unique body hypotheses of model.
    // Centered paragraphs are never downgraded to crowns.
    if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) {
      continue;
    }
    if (!StrongModel(model)) {
      // Weak (crown) models: extend the run upward over any neighbors that
      // are crown-compatible with it.
      while (start > 0 && CrownCompatible(rows, start - 1, start, model)) {
        start--;
      }
    }
    // Downgrade the run to a crown paragraph unless a valid first line of a
    // strong model sits immediately above it.
    if (start == 0 || (!StrongModel(model)) ||
        (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
      // crownify rows[start, end)
      const ParagraphModel *crown_model = model;
      if (StrongModel(model)) {
        if (model->justification() == JUSTIFICATION_LEFT) {
          crown_model = kCrownLeft;
        } else {
          crown_model = kCrownRight;
        }
      }
      // Replace all hypotheses on the run with the single crown hypothesis.
      (*rows)[start].SetUnknown();
      (*rows)[start].AddStartLine(crown_model);
      for (int row = start + 1; row < end; row++) {
        (*rows)[row].SetUnknown();
        (*rows)[row].AddBodyLine(crown_model);
      }
    }
  }
  // Models that lost all strong support above are dropped from the theory.
  DiscardUnusedModels(*rows, theory);
}
| 1595 | |
| 1596 // Clear all hypotheses about lines [start, end) and reset margins. | |
| 1597 // | |
| 1598 // The empty space between the left of a row and the block boundary (and | |
| 1599 // similarly for the right) is split into two pieces: margin and indent. | |
| 1600 // In initial processing, we assume the block is tight and the margin for | |
| 1601 // all lines is set to zero. However, if our first pass does not yield | |
| 1602 // models for everything, it may be due to an inset paragraph like a | |
| 1603 // block-quote. In that case, we make a second pass over that unmarked | |
| 1604 // section of the page and reset the "margin" portion of the empty space | |
| 1605 // to the common amount of space at the ends of the lines under consid- | |
| 1606 // eration. This would be equivalent to percentile set to 0. However, | |
| 1607 // sometimes we have a single character sticking out in the right margin | |
| 1608 // of a text block (like the 'r' in 'for' on line 3 above), and we can | |
| 1609 // really just ignore it as an outlier. To express this, we allow the | |
| 1610 // user to specify the percentile (0..100) of indent values to use as | |
| 1611 // the common margin for each row in the run of rows[start, end). | |
void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
                                        int end, int percentile) {
  if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) {
    return;
  }

  // Pass 1: clear every row's hypotheses and find the min/max total
  // whitespace (margin + indent) on each side.  Empty rows are skipped for
  // the range bounds, though the seeds come from rows[start] regardless.
  int lmin, lmax, rmin, rmax;
  lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
  rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
  for (int i = start; i < end; i++) {
    RowScratchRegisters &sr = (*rows)[i];
    sr.SetUnknown();
    if (sr.ri_->num_words == 0) {
      continue;
    }
    UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
    UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
  }
  // Pass 2: histogram the per-row whitespace so a percentile can be taken.
  STATS lefts(lmin, lmax);
  STATS rights(rmin, rmax);
  for (int i = start; i < end; i++) {
    RowScratchRegisters &sr = (*rows)[i];
    if (sr.ri_->num_words == 0) {
      continue;
    }
    lefts.add(sr.lmargin_ + sr.lindent_, 1);
    rights.add(sr.rmargin_ + sr.rindent_, 1);
  }
  // The requested percentile of whitespace becomes the common "margin";
  // whatever remains stays in each row's "indent".
  int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
  int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
  // Pass 3: shift each row's margin/indent split; the sum is unchanged.
  for (int i = start; i < end; i++) {
    RowScratchRegisters &sr = (*rows)[i];
    int ldelta = ignorable_left - sr.lmargin_;
    sr.lmargin_ += ldelta;
    sr.lindent_ -= ldelta;
    int rdelta = ignorable_right - sr.rmargin_;
    sr.rmargin_ += rdelta;
    sr.rindent_ -= rdelta;
  }
}
| 1652 | |
| 1653 // Return the median inter-word space in rows[row_start, row_end). | |
| 1654 int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) { | |
| 1655 if (row_end < row_start + 1) { | |
| 1656 return 1; | |
| 1657 } | |
| 1658 int word_height = | |
| 1659 (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2; | |
| 1660 int word_width = | |
| 1661 (rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2; | |
| 1662 STATS spacing_widths(0, 4 + word_width); | |
| 1663 for (int i = row_start; i < row_end; i++) { | |
| 1664 if (rows[i].ri_->num_words > 1) { | |
| 1665 spacing_widths.add(rows[i].ri_->average_interword_space, 1); | |
| 1666 } | |
| 1667 } | |
| 1668 int minimum_reasonable_space = word_height / 3; | |
| 1669 if (minimum_reasonable_space < 2) { | |
| 1670 minimum_reasonable_space = 2; | |
| 1671 } | |
| 1672 int median = spacing_widths.median(); | |
| 1673 return (median > minimum_reasonable_space) ? median : minimum_reasonable_space; | |
| 1674 } | |
| 1675 | |
| 1676 // Return whether the first word on the after line can fit in the space at | |
| 1677 // the end of the before line (knowing which way the text is aligned and read). | |
| 1678 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, | |
| 1679 tesseract::ParagraphJustification justification) { | |
| 1680 if (before.ri_->num_words == 0 || after.ri_->num_words == 0) { | |
| 1681 return true; | |
| 1682 } | |
| 1683 | |
| 1684 if (justification == JUSTIFICATION_UNKNOWN) { | |
| 1685 tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n"); | |
| 1686 } | |
| 1687 int available_space; | |
| 1688 if (justification == JUSTIFICATION_CENTER) { | |
| 1689 available_space = before.lindent_ + before.rindent_; | |
| 1690 } else { | |
| 1691 available_space = before.OffsideIndent(justification); | |
| 1692 } | |
| 1693 available_space -= before.ri_->average_interword_space; | |
| 1694 | |
| 1695 if (before.ri_->ltr) { | |
| 1696 return after.ri_->lword_box.width() < available_space; | |
| 1697 } | |
| 1698 return after.ri_->rword_box.width() < available_space; | |
| 1699 } | |
| 1700 | |
| 1701 // Return whether the first word on the after line can fit in the space at | |
| 1702 // the end of the before line (not knowing which way the text goes) in a left | |
| 1703 // or right alignment. | |
| 1704 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after) { | |
| 1705 if (before.ri_->num_words == 0 || after.ri_->num_words == 0) { | |
| 1706 return true; | |
| 1707 } | |
| 1708 | |
| 1709 int available_space = before.lindent_; | |
| 1710 if (before.rindent_ > available_space) { | |
| 1711 available_space = before.rindent_; | |
| 1712 } | |
| 1713 available_space -= before.ri_->average_interword_space; | |
| 1714 | |
| 1715 if (before.ri_->ltr) { | |
| 1716 return after.ri_->lword_box.width() < available_space; | |
| 1717 } | |
| 1718 return after.ri_->rword_box.width() < available_space; | |
| 1719 } | |
| 1720 | |
| 1721 static bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after) { | |
| 1722 if (before.ri_->ltr) { | |
| 1723 return before.ri_->rword_likely_ends_idea && after.ri_->lword_likely_starts_idea; | |
| 1724 } else { | |
| 1725 return before.ri_->lword_likely_ends_idea && after.ri_->rword_likely_starts_idea; | |
| 1726 } | |
| 1727 } | |
| 1728 | |
| 1729 static bool LikelyParagraphStart(const RowScratchRegisters &before, | |
| 1730 const RowScratchRegisters &after, | |
| 1731 tesseract::ParagraphJustification j) { | |
| 1732 return before.ri_->num_words == 0 || | |
| 1733 (FirstWordWouldHaveFit(before, after, j) && TextSupportsBreak(before, after)); | |
| 1734 } | |
| 1735 | |
| 1736 // Examine rows[start, end) and try to determine what sort of ParagraphModel | |
| 1737 // would fit them as a single paragraph. | |
| 1738 // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN. | |
| 1739 // If the rows given could be a consistent start to a paragraph, set *consistent | |
| 1740 // true. | |
| 1741 static ParagraphModel InternalParagraphModelByOutline( | |
| 1742 const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance, | |
| 1743 bool *consistent) { | |
| 1744 int ltr_line_count = 0; | |
| 1745 for (int i = start; i < end; i++) { | |
| 1746 ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr); | |
| 1747 } | |
| 1748 bool ltr = (ltr_line_count >= (end - start) / 2); | |
| 1749 | |
| 1750 *consistent = true; | |
| 1751 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) { | |
| 1752 return ParagraphModel(); | |
| 1753 } | |
| 1754 | |
| 1755 // Ensure the caller only passed us a region with a common rmargin and | |
| 1756 // lmargin. | |
| 1757 int lmargin = (*rows)[start].lmargin_; | |
| 1758 int rmargin = (*rows)[start].rmargin_; | |
| 1759 int lmin, lmax, rmin, rmax, cmin, cmax; | |
| 1760 lmin = lmax = (*rows)[start + 1].lindent_; | |
| 1761 rmin = rmax = (*rows)[start + 1].rindent_; | |
| 1762 cmin = cmax = 0; | |
| 1763 for (int i = start + 1; i < end; i++) { | |
| 1764 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) { | |
| 1765 tprintf("Margins don't match! Software error.\n"); | |
| 1766 *consistent = false; | |
| 1767 return ParagraphModel(); | |
| 1768 } | |
| 1769 UpdateRange((*rows)[i].lindent_, &lmin, &lmax); | |
| 1770 UpdateRange((*rows)[i].rindent_, &rmin, &rmax); | |
| 1771 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax); | |
| 1772 } | |
| 1773 int ldiff = lmax - lmin; | |
| 1774 int rdiff = rmax - rmin; | |
| 1775 int cdiff = cmax - cmin; | |
| 1776 if (rdiff > tolerance && ldiff > tolerance) { | |
| 1777 if (cdiff < tolerance * 2) { | |
| 1778 if (end - start < 3) { | |
| 1779 return ParagraphModel(); | |
| 1780 } | |
| 1781 return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance); | |
| 1782 } | |
| 1783 *consistent = false; | |
| 1784 return ParagraphModel(); | |
| 1785 } | |
| 1786 if (end - start < 3) { // Don't return a model for two line paras. | |
| 1787 return ParagraphModel(); | |
| 1788 } | |
| 1789 | |
| 1790 // These booleans keep us from saying something is aligned left when the body | |
| 1791 // left variance is too large. | |
| 1792 bool body_admits_left_alignment = ldiff < tolerance; | |
| 1793 bool body_admits_right_alignment = rdiff < tolerance; | |
| 1794 | |
| 1795 ParagraphModel left_model = ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_, | |
| 1796 (lmin + lmax) / 2, tolerance); | |
| 1797 ParagraphModel right_model = ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_, | |
| 1798 (rmin + rmax) / 2, tolerance); | |
| 1799 | |
| 1800 // These booleans keep us from having an indent on the "wrong side" for the | |
| 1801 // first line. | |
| 1802 bool text_admits_left_alignment = ltr || left_model.is_flush(); | |
| 1803 bool text_admits_right_alignment = !ltr || right_model.is_flush(); | |
| 1804 | |
| 1805 // At least one of the edges is less than tolerance in variance. | |
| 1806 // If the other is obviously ragged, it can't be the one aligned to. | |
| 1807 // [Note the last line is included in this raggedness.] | |
| 1808 if (tolerance < rdiff) { | |
| 1809 if (body_admits_left_alignment && text_admits_left_alignment) { | |
| 1810 return left_model; | |
| 1811 } | |
| 1812 *consistent = false; | |
| 1813 return ParagraphModel(); | |
| 1814 } | |
| 1815 if (tolerance < ldiff) { | |
| 1816 if (body_admits_right_alignment && text_admits_right_alignment) { | |
| 1817 return right_model; | |
| 1818 } | |
| 1819 *consistent = false; | |
| 1820 return ParagraphModel(); | |
| 1821 } | |
| 1822 | |
| 1823 // At this point, we know the body text doesn't vary much on either side. | |
| 1824 | |
| 1825 // If the first line juts out oddly in one direction or the other, | |
| 1826 // that likely indicates the side aligned to. | |
| 1827 int first_left = (*rows)[start].lindent_; | |
| 1828 int first_right = (*rows)[start].rindent_; | |
| 1829 | |
| 1830 if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) { | |
| 1831 return left_model; | |
| 1832 } | |
| 1833 if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) { | |
| 1834 return right_model; | |
| 1835 } | |
| 1836 | |
| 1837 *consistent = false; | |
| 1838 return ParagraphModel(); | |
| 1839 } | |
| 1840 | |
| 1841 // Examine rows[start, end) and try to determine what sort of ParagraphModel | |
| 1842 // would fit them as a single paragraph. If nothing fits, | |
| 1843 // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug | |
| 1844 // output if we're debugging. | |
| 1845 static ParagraphModel ParagraphModelByOutline(int debug_level, | |
| 1846 const std::vector<RowScratchRegisters> *rows, | |
| 1847 int start, int end, int tolerance) { | |
| 1848 bool unused_consistent; | |
| 1849 ParagraphModel retval = | |
| 1850 InternalParagraphModelByOutline(rows, start, end, tolerance, &unused_consistent); | |
| 1851 if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) { | |
| 1852 tprintf("Could not determine a model for this paragraph:\n"); | |
| 1853 PrintRowRange(*rows, start, end); | |
| 1854 } | |
| 1855 return retval; | |
| 1856 } | |
| 1857 | |
| 1858 // Do rows[start, end) form a single instance of the given paragraph model? | |
| 1859 bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end, | |
| 1860 const ParagraphModel *model) { | |
| 1861 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) { | |
| 1862 return false; | |
| 1863 } | |
| 1864 if (!ValidFirstLine(rows, start, model)) { | |
| 1865 return false; | |
| 1866 } | |
| 1867 for (int i = start + 1; i < end; i++) { | |
| 1868 if (!ValidBodyLine(rows, i, model)) { | |
| 1869 return false; | |
| 1870 } | |
| 1871 } | |
| 1872 return true; | |
| 1873 } | |
| 1874 | |
| 1875 // Examine rows[row_start, row_end) as an independent section of text, | |
| 1876 // and mark rows that are exceptionally clear as start-of-paragraph | |
| 1877 // and paragraph-body lines. | |
| 1878 // | |
| 1879 // We presume that any lines surrounding rows[row_start, row_end) may | |
| 1880 // have wildly different paragraph models, so we don't key any data off | |
| 1881 // of those lines. | |
| 1882 // | |
| 1883 // We only take the very strongest signals, as we don't want to get | |
| 1884 // confused and marking up centered text, poetry, or source code as | |
| 1885 // clearly part of a typical paragraph. | |
static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
                               int row_end) {
  // Record patently obvious body text.
  // A line is clearly a continuation when neither of its edge words looks
  // like the start of an idea AND the previous line had no room left for
  // this line's first word.
  for (int i = row_start + 1; i < row_end; i++) {
    const RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    tesseract::ParagraphJustification typical_justification =
        prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea &&
        !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
      curr.SetBodyLine();
    }
  }

  // Record patently obvious start paragraph lines.
  //
  // It's an extremely good signal of the start of a paragraph that
  // the first word would have fit on the end of the previous line.
  // However, applying just that signal would have us mark random
  // start lines of lineated text (poetry and source code) and some
  // centered headings as paragraph start lines. Therefore, we use
  // a second qualification for a paragraph start: Not only should
  // the first word of this line have fit on the previous line,
  // but also, this line should go full to the right of the block,
  // disallowing a subsequent word from having fit on this line.

  // First row:
  {
    RowScratchRegisters &curr = (*rows)[row_start];
    RowScratchRegisters &next = (*rows)[row_start + 1];
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    // No previous line exists, so only require that this line runs full
    // (the next line's first word would not have fit) and starts an idea.
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
        (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) {
      curr.SetStartLine();
    }
  }
  // Middle rows
  for (int i = row_start + 1; i < row_end - 1; i++) {
    RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    RowScratchRegisters &next = (*rows)[i + 1];
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
  // Last row
  { // the short circuit at the top means we have at least two lines.
    RowScratchRegisters &prev = (*rows)[row_end - 2];
    RowScratchRegisters &curr = (*rows)[row_end - 1];
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    // NOTE(review): FirstWordWouldHaveFit(curr, curr, j) compares the last
    // row against itself (there is no following row) — presumably a
    // stand-in for "this line runs full"; confirm against upstream intent.
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, curr, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
}
| 1944 | |
| 1945 // Look for sequences of a start line followed by some body lines in | |
| 1946 // rows[row_start, row_end) and create ParagraphModels for them if | |
| 1947 // they seem coherent. | |
static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
                                int row_start, int row_end, bool allow_flush_models,
                                ParagraphTheory *theory) {
  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
    return;
  }

  int start = row_start;
  while (start < row_end) {
    // Advance to the next line already marked as a paragraph start.
    while (start < row_end && (*rows)[start].GetLineType() != LT_START) {
      start++;
    }
    if (start >= row_end - 1) {
      break;
    }

    // Indent tolerance derived from the inter-word space of the line
    // following the start line.
    int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
    int end = start;
    ParagraphModel last_model;
    bool next_consistent;
    // Greedily grow [start, end) one line at a time while each extension
    // still admits a consistent single-paragraph model.
    do {
      ++end;
      // rows[row, end) was consistent.
      // If rows[row, end + 1) is not consistent,
      // just model rows[row, end)
      if (end < row_end - 1) {
        RowScratchRegisters &next = (*rows)[end];
        LineType lt = next.GetLineType();
        // The candidate next line must be a body line, or an unknown line
        // whose first word would not have fit on the line above.
        next_consistent = lt == LT_BODY || (lt == LT_UNKNOWN &&
                                            !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
      } else {
        next_consistent = false;
      }
      if (next_consistent) {
        ParagraphModel next_model =
            InternalParagraphModelByOutline(rows, start, end + 1, tolerance, &next_consistent);
        // Refuse extensions that flip the model away from the reading
        // direction's natural justification once it was established.
        if (((*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_LEFT &&
             next_model.justification() != JUSTIFICATION_LEFT) ||
            (!(*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_RIGHT &&
             next_model.justification() != JUSTIFICATION_RIGHT)) {
          next_consistent = false;
        }
        last_model = next_model;
      } else {
        next_consistent = false;
      }
    } while (next_consistent && end < row_end);
    // At this point, rows[start, end) looked like it could have been a
    // single paragraph. If we can make a good ParagraphModel for it,
    // do so and mark this sequence with that model.
    if (end > start + 1) {
      // emit a new paragraph if we have more than one line.
      const ParagraphModel *model = nullptr;
      ParagraphModel new_model = ParagraphModelByOutline(
          debug_level, rows, start, end, Epsilon(InterwordSpace(*rows, start, end)));
      if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
        // couldn't create a good model, oh well.
      } else if (new_model.is_flush()) {
        // Flush models (first indent == body indent) are risky; see
        // DowngradeWeakestToCrowns for the rationale.
        if (end == start + 2) {
          // It's very likely we just got two paragraph starts in a row.
          end = start + 1;
        } else if (start == row_start) {
          // Mark this as a Crown.
          if (new_model.justification() == JUSTIFICATION_LEFT) {
            model = kCrownLeft;
          } else {
            model = kCrownRight;
          }
        } else if (allow_flush_models) {
          model = theory->AddModel(new_model);
        }
      } else {
        model = theory->AddModel(new_model);
      }
      if (model) {
        (*rows)[start].AddStartLine(model);
        for (int i = start + 1; i < end; i++) {
          (*rows)[i].AddBodyLine(model);
        }
      }
    }
    start = end;
  }
}
| 2032 | |
| 2033 // We examine rows[row_start, row_end) and do the following: | |
| 2034 // (1) Clear all existing hypotheses for the rows being considered. | |
| 2035 // (2) Mark up any rows as exceptionally likely to be paragraph starts | |
| 2036 // or paragraph body lines as such using both geometric and textual | |
| 2037 // clues. | |
| 2038 // (3) Form models for any sequence of start + continuation lines. | |
| 2039 // (4) Smear the paragraph models to cover surrounding text. | |
| 2040 static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows, | |
| 2041 int row_start, int row_end, ParagraphTheory *theory) { | |
| 2042 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) { | |
| 2043 return; | |
| 2044 } | |
| 2045 | |
| 2046 if (debug_level > 1) { | |
| 2047 tprintf("#############################################\n"); | |
| 2048 tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end); | |
| 2049 tprintf("#############################################\n"); | |
| 2050 } | |
| 2051 | |
| 2052 RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); | |
| 2053 MarkStrongEvidence(rows, row_start, row_end); | |
| 2054 | |
| 2055 DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows); | |
| 2056 | |
| 2057 // Create paragraph models. | |
| 2058 ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory); | |
| 2059 | |
| 2060 DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows); | |
| 2061 | |
| 2062 // At this point, some rows are marked up as paragraphs with model numbers, | |
| 2063 // and some rows are marked up as either LT_START or LT_BODY. Now let's | |
| 2064 // smear any good paragraph hypotheses forward and backward. | |
| 2065 ParagraphModelSmearer smearer(rows, row_start, row_end, theory); | |
| 2066 smearer.Smear(); | |
| 2067 } | |
| 2068 | |
| 2069 static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start, | |
| 2070 int row_end, ParagraphTheory *theory) { | |
| 2071 for (int i = row_start + 1; i < row_end - 1; i++) { | |
| 2072 if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders && | |
| 2073 (*rows)[i + 1].ri_->has_leaders) { | |
| 2074 const ParagraphModel *model = | |
| 2075 theory->AddModel(ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0)); | |
| 2076 (*rows)[i].AddStartLine(model); | |
| 2077 } | |
| 2078 } | |
| 2079 } | |
| 2080 | |
| 2081 // Collect sequences of unique hypotheses in row registers and create proper | |
| 2082 // paragraphs for them, referencing the paragraphs in row_owners. | |
| 2083 static void ConvertHypothesizedModelRunsToParagraphs(int debug_level, | |
| 2084 std::vector<RowScratchRegisters> &rows, | |
| 2085 std::vector<PARA *> *row_owners, | |
| 2086 ParagraphTheory *theory) { | |
| 2087 int end = rows.size(); | |
| 2088 int start; | |
| 2089 for (; end > 0; end = start) { | |
| 2090 start = end - 1; | |
| 2091 const ParagraphModel *model = nullptr; | |
| 2092 // TODO(eger): Be smarter about dealing with multiple hypotheses. | |
| 2093 bool single_line_paragraph = false; | |
| 2094 SetOfModels models; | |
| 2095 rows[start].NonNullHypotheses(&models); | |
| 2096 if (!models.empty()) { | |
| 2097 model = models[0]; | |
| 2098 if (rows[start].GetLineType(model) != LT_BODY) { | |
| 2099 single_line_paragraph = true; | |
| 2100 } | |
| 2101 } | |
| 2102 if (model && !single_line_paragraph) { | |
| 2103 // walk back looking for more body lines and then a start line. | |
| 2104 while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) { | |
| 2105 // do nothing | |
| 2106 } | |
| 2107 if (start < 0 || rows[start].GetLineType(model) != LT_START) { | |
| 2108 model = nullptr; | |
| 2109 } | |
| 2110 } | |
| 2111 if (model == nullptr) { | |
| 2112 continue; | |
| 2113 } | |
| 2114 // rows[start, end) should be a paragraph. | |
| 2115 PARA *p = new PARA(); | |
| 2116 if (model == kCrownLeft || model == kCrownRight) { | |
| 2117 p->is_very_first_or_continuation = true; | |
| 2118 // Crown paragraph. | |
| 2119 // If we can find an existing ParagraphModel that fits, use it, | |
| 2120 // else create a new one. | |
| 2121 for (unsigned row = end; row < rows.size(); row++) { | |
| 2122 if ((*row_owners)[row] && | |
| 2123 (ValidBodyLine(&rows, start, (*row_owners)[row]->model) && | |
| 2124 (start == 0 || ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) { | |
| 2125 model = (*row_owners)[row]->model; | |
| 2126 break; | |
| 2127 } | |
| 2128 } | |
| 2129 if (model == kCrownLeft) { | |
| 2130 // No subsequent model fits, so cons one up. | |
| 2131 model = theory->AddModel(ParagraphModel(JUSTIFICATION_LEFT, | |
| 2132 rows[start].lmargin_ + rows[start].lindent_, 0, 0, | |
| 2133 Epsilon(rows[start].ri_->average_interword_space))); | |
| 2134 } else if (model == kCrownRight) { | |
| 2135 // No subsequent model fits, so cons one up. | |
| 2136 model = theory->AddModel(ParagraphModel(JUSTIFICATION_RIGHT, | |
| 2137 rows[start].rmargin_ + rows[start].rmargin_, 0, 0, | |
| 2138 Epsilon(rows[start].ri_->average_interword_space))); | |
| 2139 } | |
| 2140 } | |
| 2141 rows[start].SetUnknown(); | |
| 2142 rows[start].AddStartLine(model); | |
| 2143 for (int i = start + 1; i < end; i++) { | |
| 2144 rows[i].SetUnknown(); | |
| 2145 rows[i].AddBodyLine(model); | |
| 2146 } | |
| 2147 p->model = model; | |
| 2148 p->has_drop_cap = rows[start].ri_->has_drop_cap; | |
| 2149 p->is_list_item = model->justification() == JUSTIFICATION_RIGHT | |
| 2150 ? rows[start].ri_->rword_indicates_list_item | |
| 2151 : rows[start].ri_->lword_indicates_list_item; | |
| 2152 for (int row = start; row < end; row++) { | |
| 2153 if ((*row_owners)[row] != nullptr) { | |
| 2154 tprintf( | |
| 2155 "Memory leak! ConvertHypothesizeModelRunsToParagraphs() called " | |
| 2156 "more than once!\n"); | |
| 2157 delete (*row_owners)[row]; | |
| 2158 } | |
| 2159 (*row_owners)[row] = p; | |
| 2160 } | |
| 2161 } | |
| 2162 } | |
| 2163 | |
// A half-open range [begin, end) of row indices; default-constructed empty.
struct Interval {
  Interval() = default;
  Interval(int b, int e) : begin(b), end(e) {}

  int begin = 0;
  int end = 0;
};
| 2171 | |
// Return whether rows[row] appears to be stranded, meaning that the evidence
// for this row is very weak due to context. For instance, two lines of source
// code may happen to be indented at the same tab vector as body text starts,
// leading us to think they are two start-of-paragraph lines. This is not
// optimal. However, we also don't want to mark a sequence of short dialog
// as "weak," so our heuristic is:
// (1) If a line is surrounded by lines of unknown type, it's weak.
// (2) If two lines in a row are start lines for a given paragraph type, but
// after that the same paragraph type does not continue, they're weak.
static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) {
  SetOfModels row_models;
  rows[row].StrongHypotheses(&row_models);

  for (auto &row_model : row_models) {
    // NOTE(review): GetLineType() returns a LineType enum implicitly
    // converted to bool here; if every enumerator is nonzero, all_starts
    // begins true regardless of the actual type.  Possibly the intent was
    // GetLineType(row_model) == LT_START — confirm against the LineType
    // definition before changing.
    bool all_starts = rows[row].GetLineType();
    int run_length = 1;
    bool continues = true;
    // Scan upward while rows keep matching this model.
    for (int i = row - 1; i >= 0 && continues; i--) {
      SetOfModels models;
      rows[i].NonNullHypotheses(&models); // NOTE(review): result unused.
      switch (rows[i].GetLineType(row_model)) {
        case LT_START:
          run_length++;
          break;
        case LT_MULTIPLE: // explicit fall-through
        case LT_BODY:
          run_length++;
          all_starts = false;
          break;
        case LT_UNKNOWN: // explicit fall-through
        default:
          continues = false;
      }
    }
    continues = true;
    // Scan downward likewise.
    for (unsigned i = row + 1; i < rows.size() && continues; i++) {
      SetOfModels models;
      rows[i].NonNullHypotheses(&models); // NOTE(review): result unused.
      switch (rows[i].GetLineType(row_model)) {
        case LT_START:
          run_length++;
          break;
        case LT_MULTIPLE: // explicit fall-through
        case LT_BODY:
          run_length++;
          all_starts = false;
          break;
        case LT_UNKNOWN: // explicit fall-through
        default:
          continues = false;
      }
    }
    // A long run, or a run containing any body line, is not stranded.
    if (run_length > 2 || (!all_starts && run_length > 1)) {
      return false;
    }
  }
  return true;
}
| 2230 | |
// Go through rows[row_start, row_end) and gather up sequences that need better
// classification.
//  + Sequences of non-empty rows without hypotheses.
//  + Crown paragraphs not immediately followed by a strongly modeled line.
//  + Single line paragraphs surrounded by text that doesn't match the
//    model.
// Output intervals in *to_fix are half-open: [begin, end).
static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,
                             std::vector<Interval> *to_fix, int row_start, int row_end) {
  to_fix->clear();
  for (int i = row_start; i < row_end; i++) {
    bool needs_fixing = false;

    SetOfModels models;
    SetOfModels models_w_crowns;
    rows[i].StrongHypotheses(&models);
    rows[i].NonNullHypotheses(&models_w_crowns);
    if (models.empty() && !models_w_crowns.empty()) {
      // Only weak (crown) hypotheses here. Scan forward until we hit either
      // an unmodeled row (this crown needs fixing) or a strongly modeled row
      // (the crown is adequately supported). If we run off the end of the
      // page without either, needs_fixing stays false.
      for (unsigned end = i + 1; end < rows.size(); end++) {
        SetOfModels end_models;
        SetOfModels strong_end_models;
        rows[end].NonNullHypotheses(&end_models);
        rows[end].StrongHypotheses(&strong_end_models);
        if (end_models.empty()) {
          needs_fixing = true;
          break;
        } else if (!strong_end_models.empty()) {
          needs_fixing = false;
          break;
        }
      }
    } else if (models.empty() && rows[i].ri_->num_words > 0) {
      // No models at all on a non-empty row.
      needs_fixing = true;
    }

    // Even a strongly modeled row is reconsidered if its context leaves it
    // stranded (see RowIsStranded).
    if (!needs_fixing && !models.empty()) {
      needs_fixing = RowIsStranded(rows, i);
    }

    if (needs_fixing) {
      // Merge with the previous interval if this row directly follows it
      // (intervals are still inclusive at this point).
      if (!to_fix->empty() && to_fix->back().end == i - 1) {
        to_fix->back().end = i;
      } else {
        to_fix->push_back(Interval(i, i));
      }
    }
  }
  // Convert inclusive intervals to half-open intervals.
  for (auto &i : *to_fix) {
    i.end = i.end + 1;
  }
}
| 2284 | |
| 2285 // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known), | |
| 2286 // normalize each row_owner to point to an actual PARA, and output the | |
| 2287 // paragraphs in order onto paragraphs. | |
| 2288 void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) { | |
| 2289 std::vector<PARA *> &rows = *row_owners; | |
| 2290 paragraphs->clear(); | |
| 2291 PARA_IT out(paragraphs); | |
| 2292 PARA *formerly_null = nullptr; | |
| 2293 for (unsigned i = 0; i < rows.size(); i++) { | |
| 2294 if (rows[i] == nullptr) { | |
| 2295 if (i == 0 || rows[i - 1] != formerly_null) { | |
| 2296 rows[i] = formerly_null = new PARA(); | |
| 2297 } else { | |
| 2298 rows[i] = formerly_null; | |
| 2299 continue; | |
| 2300 } | |
| 2301 } else if (i > 0 && rows[i - 1] == rows[i]) { | |
| 2302 continue; | |
| 2303 } | |
| 2304 out.add_after_then_move(rows[i]); | |
| 2305 } | |
| 2306 } | |
| 2307 | |
// Main entry point for Paragraph Detection Algorithm.
//
// Given a set of equally spaced textlines (described by row_infos),
// Split them into paragraphs.
//
// Output:
//   row_owners - one pointer for each row, to the paragraph it belongs to.
//   paragraphs - this is the actual list of PARA objects.
//   models - the list of paragraph models referenced by the PARA objects.
//            caller is responsible for deleting the models.
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
                      std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
                      std::vector<ParagraphModel *> *models) {
  ParagraphTheory theory(models);

  // Initialize row_owners to be a bunch of nullptr pointers.
  row_owners->clear();
  row_owners->resize(row_infos->size());

  // Set up row scratch registers for the main algorithm.
  std::vector<RowScratchRegisters> rows(row_infos->size());
  for (unsigned i = 0; i < row_infos->size(); i++) {
    rows[i].Init((*row_infos)[i]);
  }

  // Pass 1:
  //   Detect sequences of lines that all contain leader dots (.....)
  //   These are likely Tables of Contents.  If there are three text lines in
  //   a row with leader dots, it's pretty safe to say the middle one should
  //   be a paragraph of its own.
  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);

  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);

  std::vector<Interval> leftovers;
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (auto &leftover : leftovers) {
    // Pass 2a:
    //   Find any strongly evidenced start-of-paragraph lines.  If they're
    //   followed by two lines that look like body lines, make a paragraph
    //   model for that and see if that model applies throughout the text
    //   (that is, "smear" it).
    StrongEvidenceClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);

    // Pass 2b:
    //   If we had any luck in pass 2a, we got part of the page and didn't
    //   know how to classify a few runs of rows. Take the segments that
    //   didn't find a model and reprocess them individually.
    std::vector<Interval> leftovers2;
    LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
    // Pass 2a "was useful" unless the single remaining leftover interval
    // still covers everything.
    // NOTE(review): the bounds are compared against 0 and rows.size() rather
    // than leftover.begin/leftover.end even though leftovers2 was computed
    // over the leftover span only -- looks intentional upstream but worth
    // confirming.
    bool pass2a_was_useful =
        leftovers2.size() > 1 ||
        (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
    if (pass2a_was_useful) {
      for (auto &leftover2 : leftovers2) {
        StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
      }
    }
  }

  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);

  // Pass 3:
  //   These are the dregs for which we didn't have enough strong textual
  //   and geometric clues to form matching models for.  Let's see if
  //   the geometric clues are simple enough that we could just use those.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (auto &leftover : leftovers) {
    GeometricClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
  }

  // Undo any flush models for which there's little evidence.
  DowngradeWeakestToCrowns(debug_level, &theory, &rows);

  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);

  // Pass 4:
  //   Take everything that's still not marked up well and clear all markings.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (auto &leftover : leftovers) {
    for (int j = leftover.begin; j < leftover.end; j++) {
      rows[j].SetUnknown();
    }
  }

  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);

  // Convert all of the unique hypothesis runs to PARAs.
  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory);

  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);

  // Finally, clean up any dangling nullptr row paragraph parents.
  CanonicalizeDetectionResults(row_owners, paragraphs);
}
| 2403 | |
| 2404 // ============ Code interfacing with the rest of Tesseract ================== | |
| 2405 | |
// Before any real OCR text exists, fill in info->text with fake "x"
// characters (one per symbol, space-separated per word), set lword_text /
// rword_text placeholders, and compute lword_box, rword_box and num_words
// from the page layout.
static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) {
  // Set up text, lword_text, and rword_text (mostly for debug printing).
  std::string fake_text;
  PageIterator pit(static_cast<const PageIterator &>(it));
  if (!pit.Empty(RIL_WORD)) {
    bool first_word = true;
    do {
      fake_text += "x";  // One placeholder character per symbol.
      if (first_word) {
        info->lword_text += "x";
      }
      info->rword_text += "x";
      // At a word boundary (but not end of line): emit a space and reset the
      // right-word accumulator, so rword_text ends up holding the last word.
      if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
          !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
        fake_text += " ";
        info->rword_text = "";
        first_word = false;
      }
    } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && pit.Next(RIL_SYMBOL));
  }
  if (fake_text.empty()) {
    return;
  }

  // Left-pad with roughly one space per interword-space worth of left margin.
  int lspaces = info->pix_ldistance / info->average_interword_space;
  for (int i = 0; i < lspaces; i++) {
    info->text += ' ';
  }
  info->text += fake_text;

  // Set up lword_box, rword_box, and num_words by walking this row's words.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();

  WERD_RES *lword = nullptr;
  WERD_RES *rword = nullptr;
  info->num_words = 0;
  do {
    if (word_res) {
      if (!lword) {
        lword = word_res;  // First word of the row.
      }
      if (rword != word_res) {
        info->num_words++;
      }
      rword = word_res;  // Last word seen so far.
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);

  if (lword) {
    info->lword_box = lword->word->bounding_box();
  }
  if (rword) {
    info->rword_box = rword->word->bounding_box();
  }
}
| 2464 | |
// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
// detector RowInfo with all relevant information from the row: geometry
// (margins, x-height), text (real or fake), word boxes/attributes, leader
// detection, and text direction.
static void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info) {
  if (it.PageResIt()->row() != nullptr) {
    ROW *row = it.PageResIt()->row()->row;
    info->pix_ldistance = row->lmargin();
    info->pix_rdistance = row->rmargin();
    // Fall back to x-height (clamped to >= 1) if the row has no measured
    // interword space; also keeps later divisions by this value safe.
    info->average_interword_space =
        row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
    info->pix_xheight = row->x_height();
    info->has_leaders = false;
    info->has_drop_cap = row->has_drop_cap();
    info->ltr = true;  // set below depending on word scripts
  } else {
    // No layout row available: use neutral defaults.
    info->pix_ldistance = info->pix_rdistance = 0;
    info->average_interword_space = 1;
    info->pix_xheight = 1.0;
    info->has_leaders = false;
    info->has_drop_cap = false;
    info->ltr = true;
  }

  info->num_words = 0;
  info->lword_indicates_list_item = false;
  info->lword_likely_starts_idea = false;
  info->lword_likely_ends_idea = false;
  info->rword_indicates_list_item = false;
  info->rword_likely_starts_idea = false;
  info->rword_likely_ends_idea = false;
  // NOTE(review): has_leaders and ltr were already set in both branches
  // above; these re-assignments are redundant but harmless.
  info->has_leaders = false;
  info->ltr = true;

  if (!after_recognition) {
    InitializeTextAndBoxesPreRecognition(it, info);
    return;
  }
  info->text = "";
  const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
  int trailing_ws_idx = strlen(text.get());  // strip trailing space
  while (trailing_ws_idx > 0 &&
         // isspace() only takes ASCII
         isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) {
    trailing_ws_idx--;
  }
  if (trailing_ws_idx > 0) {
    // Left-pad with roughly one space per interword-space of left margin,
    // then append the line text minus its trailing whitespace.
    int lspaces = info->pix_ldistance / info->average_interword_space;
    for (int i = 0; i < lspaces; i++) {
      info->text += ' ';
    }
    for (int i = 0; i < trailing_ws_idx; i++) {
      info->text += text[i];
    }
  }

  if (info->text.empty()) {
    return;
  }

  // Walk the recognized words of this row, counting LTR/RTL votes and
  // repeated-character (leader) words.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  std::vector<WERD_RES *> werds;
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();
  int num_leaders = 0;
  int ltr = 0;
  int rtl = 0;
  do {
    if (word_res && word_res->best_choice->unichar_string().length() > 0) {
      werds.push_back(word_res);
      ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
      rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
      if (word_res->word->flag(W_REP_CHAR)) {
        num_leaders++;
      }
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);
  info->ltr = ltr >= rtl;               // majority vote; ties go to LTR
  info->has_leaders = num_leaders > 3;  // need > 3 leader words to count
  info->num_words = werds.size();
  if (!werds.empty()) {
    // Record the first and last words and their idea/list-item attributes.
    WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
    info->lword_text = lword->best_choice->unichar_string().c_str();
    info->rword_text = rword->best_choice->unichar_string().c_str();
    info->lword_box = lword->word->bounding_box();
    info->rword_box = rword->word->bounding_box();
    LeftWordAttributes(lword->uch_set, lword->best_choice, info->lword_text,
                       &info->lword_indicates_list_item, &info->lword_likely_starts_idea,
                       &info->lword_likely_ends_idea);
    RightWordAttributes(rword->uch_set, rword->best_choice, info->rword_text,
                        &info->rword_indicates_list_item, &info->rword_likely_starts_idea,
                        &info->rword_likely_ends_idea);
  }
}
| 2558 | |
// This is called after rows have been identified and words are recognized.
// Much of this could be implemented before word recognition, but text helps
// to identify bulleted lists and gives good signals for sentence boundaries.
// Converts the block at block_start into RowInfos, runs the detection
// algorithm above, and writes the resulting PARAs back onto the ROWs.
void DetectParagraphs(int debug_level, bool after_text_recognition,
                      const MutableIterator *block_start, std::vector<ParagraphModel *> *models) {
  // Clear out any preconceived notions.
  if (block_start->Empty(RIL_TEXTLINE)) {
    return;
  }
  BLOCK *block = block_start->PageResIt()->block()->block;
  block->para_list()->clear();
  // Image (non-text) blocks skip detection and just get placeholder PARAs.
  bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();

  // Convert the Tesseract structures to RowInfos
  // for the paragraph detection algorithm.
  MutableIterator row(*block_start);
  if (row.Empty(RIL_TEXTLINE)) {
    return;  // end of input already.
  }

  std::vector<RowInfo> row_infos;
  do {
    if (!row.PageResIt()->row()) {
      continue;  // empty row.
    }
    // Reset any previous paragraph assignment before re-detecting.
    row.PageResIt()->row()->row->set_para(nullptr);
    row_infos.emplace_back();
    RowInfo &ri = row_infos.back();
    InitializeRowInfo(after_text_recognition, row, &ri);
  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && row.Next(RIL_TEXTLINE));

  // If we're called before text recognition, we might not have
  // tight block bounding boxes, so trim by the minimum on each side.
  if (!row_infos.empty()) {
    int min_lmargin = row_infos[0].pix_ldistance;
    int min_rmargin = row_infos[0].pix_rdistance;
    for (unsigned i = 1; i < row_infos.size(); i++) {
      if (row_infos[i].pix_ldistance < min_lmargin) {
        min_lmargin = row_infos[i].pix_ldistance;
      }
      if (row_infos[i].pix_rdistance < min_rmargin) {
        min_rmargin = row_infos[i].pix_rdistance;
      }
    }
    if (min_lmargin > 0 || min_rmargin > 0) {
      for (auto &row_info : row_infos) {
        row_info.pix_ldistance -= min_lmargin;
        row_info.pix_rdistance -= min_rmargin;
      }
    }
  }

  // Run the paragraph detection algorithm.
  std::vector<PARA *> row_owners;
  if (!is_image_block) {
    DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);
  } else {
    // Image block: every row gets a nullptr owner, canonicalized into
    // placeholder PARAs.
    row_owners.resize(row_infos.size());
    CanonicalizeDetectionResults(&row_owners, block->para_list());
  }

  // Now stitch in the row_owners into the rows, skipping empty rows just as
  // the RowInfo-building loop above did so the two stay in lockstep.
  row = *block_start;
  for (auto &row_owner : row_owners) {
    while (!row.PageResIt()->row()) {
      row.Next(RIL_TEXTLINE);
    }
    row.PageResIt()->row()->row->set_para(row_owner);
    row.Next(RIL_TEXTLINE);
  }
}
| 2630 | |
| 2631 } // namespace tesseract |
