Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/colpartitionset.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: colpartitionset.cpp | |
| 3 // Description: Class to hold a list of ColPartitions of the page that | |
| 4 // correspond roughly to columns. | |
| 5 // Author: Ray Smith | |
| 6 // | |
| 7 // (C) Copyright 2008, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 // | |
| 18 /////////////////////////////////////////////////////////////////////// | |
| 19 | |
| 20 #ifdef HAVE_CONFIG_H | |
| 21 # include "config_auto.h" | |
| 22 #endif | |
| 23 | |
| 24 #include "colpartitionset.h" | |
| 25 #include "tablefind.h" | |
| 26 #include "workingpartset.h" | |
| 27 | |
| 28 namespace tesseract { | |
| 29 | |
| 30 // Minimum width of a column to be interesting as a multiple of resolution. | |
| 31 const double kMinColumnWidth = 2.0 / 3; | |
| 32 | |
| 33 ColPartitionSet::ColPartitionSet(ColPartition_LIST *partitions) { | |
| 34 ColPartition_IT it(&parts_); | |
| 35 it.add_list_after(partitions); | |
| 36 ComputeCoverage(); | |
| 37 } | |
| 38 | |
| 39 ColPartitionSet::ColPartitionSet(ColPartition *part) { | |
| 40 ColPartition_IT it(&parts_); | |
| 41 it.add_after_then_move(part); | |
| 42 ComputeCoverage(); | |
| 43 } | |
| 44 | |
| 45 // Returns the number of columns of good width. | |
| 46 int ColPartitionSet::GoodColumnCount() const { | |
| 47 int num_good_cols = 0; | |
| 48 // This is a read-only iteration of the list. | |
| 49 ColPartition_IT it(const_cast<ColPartition_LIST *>(&parts_)); | |
| 50 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 51 if (it.data()->good_width()) { | |
| 52 ++num_good_cols; | |
| 53 } | |
| 54 } | |
| 55 return num_good_cols; | |
| 56 } | |
| 57 | |
| 58 // Return an element of the parts_ list from its index. | |
| 59 ColPartition *ColPartitionSet::GetColumnByIndex(int index) { | |
| 60 ColPartition_IT it(&parts_); | |
| 61 it.mark_cycle_pt(); | |
| 62 for (int i = 0; i < index && !it.cycled_list(); ++i, it.forward()) { | |
| 63 ; | |
| 64 } | |
| 65 if (it.cycled_list()) { | |
| 66 return nullptr; | |
| 67 } | |
| 68 return it.data(); | |
| 69 } | |
| 70 | |
| 71 // Return the ColPartition that contains the given coords, if any, else nullptr. | |
| 72 ColPartition *ColPartitionSet::ColumnContaining(int x, int y) { | |
| 73 ColPartition_IT it(&parts_); | |
| 74 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 75 ColPartition *part = it.data(); | |
| 76 if (part->ColumnContains(x, y)) { | |
| 77 return part; | |
| 78 } | |
| 79 } | |
| 80 return nullptr; | |
| 81 } | |
| 82 | |
| 83 // Extract all the parts from the list, relinquishing ownership. | |
| 84 void ColPartitionSet::RelinquishParts() { | |
| 85 ColPartition_IT it(&parts_); | |
| 86 while (!it.empty()) { | |
| 87 it.extract(); | |
| 88 it.forward(); | |
| 89 } | |
| 90 } | |
| 91 | |
| 92 // Attempt to improve this by adding partitions or expanding partitions. | |
| 93 void ColPartitionSet::ImproveColumnCandidate(const WidthCallback &cb, | |
| 94 PartSetVector *src_sets) { | |
| 95 int set_size = src_sets->size(); | |
| 96 // Iterate over the provided column sets, as each one may have something | |
| 97 // to improve this. | |
| 98 for (int i = 0; i < set_size; ++i) { | |
| 99 ColPartitionSet *column_set = src_sets->at(i); | |
| 100 if (column_set == nullptr) { | |
| 101 continue; | |
| 102 } | |
| 103 // Iterate over the parts in this and column_set, adding bigger or | |
| 104 // new parts in column_set to this. | |
| 105 ColPartition_IT part_it(&parts_); | |
| 106 ASSERT_HOST(!part_it.empty()); | |
| 107 int prev_right = INT32_MIN; | |
| 108 part_it.mark_cycle_pt(); | |
| 109 ColPartition_IT col_it(&column_set->parts_); | |
| 110 for (col_it.mark_cycle_pt(); !col_it.cycled_list(); col_it.forward()) { | |
| 111 ColPartition *col_part = col_it.data(); | |
| 112 if (col_part->blob_type() < BRT_UNKNOWN) { | |
| 113 continue; // Ignore image partitions. | |
| 114 } | |
| 115 int col_left = col_part->left_key(); | |
| 116 int col_right = col_part->right_key(); | |
| 117 // Sync-up part_it (in this) so it matches the col_part in column_set. | |
| 118 ColPartition *part = part_it.data(); | |
| 119 while (!part_it.at_last() && part->right_key() < col_left) { | |
| 120 prev_right = part->right_key(); | |
| 121 part_it.forward(); | |
| 122 part = part_it.data(); | |
| 123 } | |
| 124 int part_left = part->left_key(); | |
| 125 int part_right = part->right_key(); | |
| 126 if (part_right < col_left || col_right < part_left) { | |
| 127 // There is no overlap so this is a new partition. | |
| 128 AddPartition(col_part->ShallowCopy(), &part_it); | |
| 129 continue; | |
| 130 } | |
| 131 // Check the edges of col_part to see if they can improve part. | |
| 132 bool part_width_ok = cb(part->KeyWidth(part_left, part_right)); | |
| 133 if (col_left < part_left && col_left > prev_right) { | |
| 134 // The left edge of the column is better and it doesn't overlap, | |
| 135 // so we can potentially expand it. | |
| 136 int col_box_left = col_part->BoxLeftKey(); | |
| 137 bool tab_width_ok = cb(part->KeyWidth(col_left, part_right)); | |
| 138 bool box_width_ok = cb(part->KeyWidth(col_box_left, part_right)); | |
| 139 if (tab_width_ok || (!part_width_ok)) { | |
| 140 // The tab is leaving the good column metric at least as good as | |
| 141 // it was before, so use the tab. | |
| 142 part->CopyLeftTab(*col_part, false); | |
| 143 part->SetColumnGoodness(cb); | |
| 144 } else if (col_box_left < part_left && | |
| 145 (box_width_ok || !part_width_ok)) { | |
| 146 // The box is leaving the good column metric at least as good as | |
| 147 // it was before, so use the box. | |
| 148 part->CopyLeftTab(*col_part, true); | |
| 149 part->SetColumnGoodness(cb); | |
| 150 } | |
| 151 part_left = part->left_key(); | |
| 152 } | |
| 153 if (col_right > part_right && | |
| 154 (part_it.at_last() || | |
| 155 part_it.data_relative(1)->left_key() > col_right)) { | |
| 156 // The right edge is better, so we can possibly expand it. | |
| 157 int col_box_right = col_part->BoxRightKey(); | |
| 158 bool tab_width_ok = cb(part->KeyWidth(part_left, col_right)); | |
| 159 bool box_width_ok = cb(part->KeyWidth(part_left, col_box_right)); | |
| 160 if (tab_width_ok || (!part_width_ok)) { | |
| 161 // The tab is leaving the good column metric at least as good as | |
| 162 // it was before, so use the tab. | |
| 163 part->CopyRightTab(*col_part, false); | |
| 164 part->SetColumnGoodness(cb); | |
| 165 } else if (col_box_right > part_right && | |
| 166 (box_width_ok || !part_width_ok)) { | |
| 167 // The box is leaving the good column metric at least as good as | |
| 168 // it was before, so use the box. | |
| 169 part->CopyRightTab(*col_part, true); | |
| 170 part->SetColumnGoodness(cb); | |
| 171 } | |
| 172 } | |
| 173 } | |
| 174 } | |
| 175 ComputeCoverage(); | |
| 176 } | |
| 177 | |
| 178 // If this set is good enough to represent a new partitioning into columns, | |
| 179 // add it to the vector of sets, otherwise delete it. | |
| 180 void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector *column_sets, | |
| 181 const WidthCallback &cb) { | |
| 182 bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(), | |
| 183 bounding_box_.bottom()); | |
| 184 if (debug) { | |
| 185 tprintf("Considering new column candidate:\n"); | |
| 186 Print(); | |
| 187 } | |
| 188 if (!LegalColumnCandidate()) { | |
| 189 if (debug) { | |
| 190 tprintf("Not a legal column candidate:\n"); | |
| 191 Print(); | |
| 192 } | |
| 193 delete this; | |
| 194 return; | |
| 195 } | |
| 196 for (unsigned i = 0; i < column_sets->size(); ++i) { | |
| 197 ColPartitionSet *columns = column_sets->at(i); | |
| 198 // In ordering the column set candidates, good_coverage_ is king, | |
| 199 // followed by good_column_count_ and then bad_coverage_. | |
| 200 bool better = good_coverage_ > columns->good_coverage_; | |
| 201 if (good_coverage_ == columns->good_coverage_) { | |
| 202 better = good_column_count_ > columns->good_column_count_; | |
| 203 if (good_column_count_ == columns->good_column_count_) { | |
| 204 better = bad_coverage_ > columns->bad_coverage_; | |
| 205 } | |
| 206 } | |
| 207 if (better) { | |
| 208 // The new one is better so add it. | |
| 209 if (debug) { | |
| 210 tprintf("Good one\n"); | |
| 211 } | |
| 212 column_sets->insert(column_sets->begin() + i, this); | |
| 213 return; | |
| 214 } | |
| 215 if (columns->CompatibleColumns(false, this, cb)) { | |
| 216 if (debug) { | |
| 217 tprintf("Duplicate\n"); | |
| 218 } | |
| 219 delete this; | |
| 220 return; // It is not unique. | |
| 221 } | |
| 222 } | |
| 223 if (debug) { | |
| 224 tprintf("Added to end\n"); | |
| 225 } | |
| 226 column_sets->push_back(this); | |
| 227 } | |
| 228 | |
| 229 // Return true if the partitions in other are all compatible with the columns | |
| 230 // in this. | |
| 231 bool ColPartitionSet::CompatibleColumns(bool debug, ColPartitionSet *other, | |
| 232 const WidthCallback &cb) { | |
| 233 if (debug) { | |
| 234 tprintf("CompatibleColumns testing compatibility\n"); | |
| 235 Print(); | |
| 236 other->Print(); | |
| 237 } | |
| 238 if (other->parts_.empty()) { | |
| 239 if (debug) { | |
| 240 tprintf("CompatibleColumns true due to empty other\n"); | |
| 241 } | |
| 242 return true; | |
| 243 } | |
| 244 ColPartition_IT it(&other->parts_); | |
| 245 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 246 ColPartition *part = it.data(); | |
| 247 if (part->blob_type() < BRT_UNKNOWN) { | |
| 248 if (debug) { | |
| 249 tprintf("CompatibleColumns ignoring image partition\n"); | |
| 250 part->Print(); | |
| 251 } | |
| 252 continue; // Image partitions are irrelevant to column compatibility. | |
| 253 } | |
| 254 int y = part->MidY(); | |
| 255 int left = part->bounding_box().left(); | |
| 256 int right = part->bounding_box().right(); | |
| 257 ColPartition *left_col = ColumnContaining(left, y); | |
| 258 ColPartition *right_col = ColumnContaining(right, y); | |
| 259 if (right_col == nullptr || left_col == nullptr) { | |
| 260 if (debug) { | |
| 261 tprintf("CompatibleColumns false due to partition edge outside\n"); | |
| 262 part->Print(); | |
| 263 } | |
| 264 return false; // A partition edge lies outside of all columns | |
| 265 } | |
| 266 if (right_col != left_col && cb(right - left)) { | |
| 267 if (debug) { | |
| 268 tprintf("CompatibleColumns false due to good width in multiple cols\n"); | |
| 269 part->Print(); | |
| 270 } | |
| 271 return false; // Partition with a good width must be in a single column. | |
| 272 } | |
| 273 | |
| 274 ColPartition_IT it2 = it; | |
| 275 while (!it2.at_last()) { | |
| 276 it2.forward(); | |
| 277 ColPartition *next_part = it2.data(); | |
| 278 if (!BLOBNBOX::IsTextType(next_part->blob_type())) { | |
| 279 continue; // Non-text partitions are irrelevant. | |
| 280 } | |
| 281 int next_left = next_part->bounding_box().left(); | |
| 282 if (next_left == right) { | |
| 283 break; // They share the same edge, so one must be a pull-out. | |
| 284 } | |
| 285 // Search to see if right and next_left fall within a single column. | |
| 286 ColPartition *next_left_col = ColumnContaining(next_left, y); | |
| 287 if (right_col == next_left_col) { | |
| 288 // There is a column break in this column. | |
| 289 // This can be due to a figure caption within a column, a pull-out | |
| 290 // block, or a simple broken textline that remains to be merged: | |
| 291 // all allowed, or a change in column layout: not allowed. | |
| 292 // If both partitions are of good width, then it is likely | |
| 293 // a change in column layout, otherwise probably an allowed situation. | |
| 294 if (part->good_width() && next_part->good_width()) { | |
| 295 if (debug) { | |
| 296 int next_right = next_part->bounding_box().right(); | |
| 297 tprintf("CompatibleColumns false due to 2 parts of good width\n"); | |
| 298 tprintf("part1 %d-%d, part2 %d-%d\n", left, right, next_left, | |
| 299 next_right); | |
| 300 right_col->Print(); | |
| 301 } | |
| 302 return false; | |
| 303 } | |
| 304 } | |
| 305 break; | |
| 306 } | |
| 307 } | |
| 308 if (debug) { | |
| 309 tprintf("CompatibleColumns true!\n"); | |
| 310 } | |
| 311 return true; | |
| 312 } | |
| 313 | |
| 314 // Returns the total width of all blobs in the part_set that do not lie | |
| 315 // within an approved column. Used as a cost measure for using this | |
| 316 // column set over another that might be compatible. | |
| 317 int ColPartitionSet::UnmatchedWidth(ColPartitionSet *part_set) { | |
| 318 int total_width = 0; | |
| 319 ColPartition_IT it(&part_set->parts_); | |
| 320 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 321 ColPartition *part = it.data(); | |
| 322 if (!BLOBNBOX::IsTextType(part->blob_type())) { | |
| 323 continue; // Non-text partitions are irrelevant to column compatibility. | |
| 324 } | |
| 325 int y = part->MidY(); | |
| 326 BLOBNBOX_C_IT box_it(part->boxes()); | |
| 327 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) { | |
| 328 const TBOX &box = it.data()->bounding_box(); | |
| 329 // Assume that the whole blob is outside any column iff its x-middle | |
| 330 // is outside. | |
| 331 int x = (box.left() + box.right()) / 2; | |
| 332 ColPartition *col = ColumnContaining(x, y); | |
| 333 if (col == nullptr) { | |
| 334 total_width += box.width(); | |
| 335 } | |
| 336 } | |
| 337 } | |
| 338 return total_width; | |
| 339 } | |
| 340 | |
| 341 // Return true if this ColPartitionSet makes a legal column candidate by | |
| 342 // having legal individual partitions and non-overlapping adjacent pairs. | |
| 343 bool ColPartitionSet::LegalColumnCandidate() { | |
| 344 ColPartition_IT it(&parts_); | |
| 345 if (it.empty()) { | |
| 346 return false; | |
| 347 } | |
| 348 bool any_text_parts = false; | |
| 349 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 350 ColPartition *part = it.data(); | |
| 351 if (BLOBNBOX::IsTextType(part->blob_type())) { | |
| 352 if (!part->IsLegal()) { | |
| 353 return false; // Individual partition is illegal. | |
| 354 } | |
| 355 any_text_parts = true; | |
| 356 } | |
| 357 if (!it.at_last()) { | |
| 358 ColPartition *next_part = it.data_relative(1); | |
| 359 if (next_part->left_key() < part->right_key()) { | |
| 360 return false; | |
| 361 } | |
| 362 } | |
| 363 } | |
| 364 return any_text_parts; | |
| 365 } | |
| 366 | |
| 367 // Return a copy of this. If good_only will only copy the Good ColPartitions. | |
| 368 ColPartitionSet *ColPartitionSet::Copy(bool good_only) { | |
| 369 ColPartition_LIST copy_parts; | |
| 370 ColPartition_IT src_it(&parts_); | |
| 371 ColPartition_IT dest_it(©_parts); | |
| 372 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { | |
| 373 ColPartition *part = src_it.data(); | |
| 374 if (BLOBNBOX::IsTextType(part->blob_type()) && | |
| 375 (!good_only || part->good_width() || part->good_column())) { | |
| 376 dest_it.add_after_then_move(part->ShallowCopy()); | |
| 377 } | |
| 378 } | |
| 379 if (dest_it.empty()) { | |
| 380 return nullptr; | |
| 381 } | |
| 382 return new ColPartitionSet(©_parts); | |
| 383 } | |
| 384 | |
| 385 // Return the bounding boxes of columns at the given y-range | |
| 386 void ColPartitionSet::GetColumnBoxes(int y_bottom, int y_top, | |
| 387 ColSegment_LIST *segments) { | |
| 388 ColPartition_IT it(&parts_); | |
| 389 ColSegment_IT col_it(segments); | |
| 390 col_it.move_to_last(); | |
| 391 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 392 ColPartition *part = it.data(); | |
| 393 ICOORD bot_left(part->LeftAtY(y_top), y_bottom); | |
| 394 ICOORD top_right(part->RightAtY(y_bottom), y_top); | |
| 395 auto *col_seg = new ColSegment(); | |
| 396 col_seg->InsertBox(TBOX(bot_left, top_right)); | |
| 397 col_it.add_after_then_move(col_seg); | |
| 398 } | |
| 399 } | |
| 400 | |
| 401 #ifndef GRAPHICS_DISABLED | |
| 402 | |
| 403 // Display the edges of the columns at the given y coords. | |
| 404 void ColPartitionSet::DisplayColumnEdges(int y_bottom, int y_top, | |
| 405 ScrollView *win) { | |
| 406 ColPartition_IT it(&parts_); | |
| 407 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 408 ColPartition *part = it.data(); | |
| 409 win->Line(part->LeftAtY(y_top), y_top, part->LeftAtY(y_bottom), y_bottom); | |
| 410 win->Line(part->RightAtY(y_top), y_top, part->RightAtY(y_bottom), y_bottom); | |
| 411 } | |
| 412 } | |
| 413 | |
| 414 #endif // !GRAPHICS_DISABLED | |
| 415 | |
| 416 // Return the ColumnSpanningType that best explains the columns overlapped | |
| 417 // by the given coords(left,right,y), with the given margins. | |
| 418 // Also return the first and last column index touched by the coords and | |
| 419 // the leftmost spanned column. | |
| 420 // Column indices are 2n + 1 for real columns (0 based) and even values | |
| 421 // represent the gaps in between columns, with 0 being left of the leftmost. | |
| 422 // resolution refers to the ppi resolution of the image. | |
| 423 ColumnSpanningType ColPartitionSet::SpanningType( | |
| 424 int resolution, int left, int right, int height, int y, int left_margin, | |
| 425 int right_margin, int *first_col, int *last_col, int *first_spanned_col) { | |
| 426 *first_col = -1; | |
| 427 *last_col = -1; | |
| 428 *first_spanned_col = -1; | |
| 429 int margin_columns = 0; | |
| 430 ColPartition_IT it(&parts_); | |
| 431 int col_index = 1; | |
| 432 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), col_index += 2) { | |
| 433 ColPartition *part = it.data(); | |
| 434 if (part->ColumnContains(left, y) || | |
| 435 (it.at_first() && part->ColumnContains(left + height, y))) { | |
| 436 // In the default case, first_col is set, but columns_spanned remains | |
| 437 // zero, so first_col will get reset in the first column genuinely | |
| 438 // spanned, but we can tell the difference from a noise partition | |
| 439 // that touches no column. | |
| 440 *first_col = col_index; | |
| 441 if (part->ColumnContains(right, y) || | |
| 442 (it.at_last() && part->ColumnContains(right - height, y))) { | |
| 443 // Both within a single column. | |
| 444 *last_col = col_index; | |
| 445 return CST_FLOWING; | |
| 446 } | |
| 447 if (left_margin <= part->LeftAtY(y)) { | |
| 448 // It completely spans this column. | |
| 449 *first_spanned_col = col_index; | |
| 450 margin_columns = 1; | |
| 451 } | |
| 452 } else if (part->ColumnContains(right, y) || | |
| 453 (it.at_last() && part->ColumnContains(right - height, y))) { | |
| 454 if (*first_col < 0) { | |
| 455 // It started in-between. | |
| 456 *first_col = col_index - 1; | |
| 457 } | |
| 458 if (right_margin >= part->RightAtY(y)) { | |
| 459 // It completely spans this column. | |
| 460 if (margin_columns == 0) { | |
| 461 *first_spanned_col = col_index; | |
| 462 } | |
| 463 ++margin_columns; | |
| 464 } | |
| 465 *last_col = col_index; | |
| 466 break; | |
| 467 } else if (left < part->LeftAtY(y) && right > part->RightAtY(y)) { | |
| 468 // Neither left nor right are contained within, so it spans this | |
| 469 // column. | |
| 470 if (*first_col < 0) { | |
| 471 // It started in between the previous column and the current column. | |
| 472 *first_col = col_index - 1; | |
| 473 } | |
| 474 if (margin_columns == 0) { | |
| 475 *first_spanned_col = col_index; | |
| 476 } | |
| 477 *last_col = col_index; | |
| 478 } else if (right < part->LeftAtY(y)) { | |
| 479 // We have gone past the end. | |
| 480 *last_col = col_index - 1; | |
| 481 if (*first_col < 0) { | |
| 482 // It must lie completely between columns =>noise. | |
| 483 *first_col = col_index - 1; | |
| 484 } | |
| 485 break; | |
| 486 } | |
| 487 } | |
| 488 if (*first_col < 0) { | |
| 489 *first_col = col_index - 1; // The last in-between. | |
| 490 } | |
| 491 if (*last_col < 0) { | |
| 492 *last_col = col_index - 1; // The last in-between. | |
| 493 } | |
| 494 ASSERT_HOST(*first_col >= 0 && *last_col >= 0); | |
| 495 ASSERT_HOST(*first_col <= *last_col); | |
| 496 if (*first_col == *last_col && right - left < kMinColumnWidth * resolution) { | |
| 497 // Neither end was in a column, and it didn't span any, so it lies | |
| 498 // entirely between columns, therefore noise. | |
| 499 return CST_NOISE; | |
| 500 } else if (margin_columns <= 1) { | |
| 501 // An exception for headings that stick outside of single-column text. | |
| 502 if (margin_columns == 1 && parts_.singleton()) { | |
| 503 return CST_HEADING; | |
| 504 } | |
| 505 // It is a pullout, as left and right were not in the same column, but | |
| 506 // it doesn't go to the edge of its start and end. | |
| 507 return CST_PULLOUT; | |
| 508 } | |
| 509 // Its margins went to the edges of first and last columns => heading. | |
| 510 return CST_HEADING; | |
| 511 } | |
| 512 | |
| 513 // The column_set has changed. Close down all in-progress WorkingPartSets in | |
| 514 // columns that do not match and start new ones for the new columns in this. | |
| 515 // As ColPartitions are turned into BLOCKs, the used ones are put in | |
| 516 // used_parts, as they still need to be referenced in the grid. | |
| 517 void ColPartitionSet::ChangeWorkColumns(const ICOORD &bleft, | |
| 518 const ICOORD &tright, int resolution, | |
| 519 ColPartition_LIST *used_parts, | |
| 520 WorkingPartSet_LIST *working_set_list) { | |
| 521 // Move the input list to a temporary location so we can delete its elements | |
| 522 // as we add them to the output working_set. | |
| 523 WorkingPartSet_LIST work_src; | |
| 524 WorkingPartSet_IT src_it(&work_src); | |
| 525 src_it.add_list_after(working_set_list); | |
| 526 src_it.move_to_first(); | |
| 527 WorkingPartSet_IT dest_it(working_set_list); | |
| 528 // Completed blocks and to_blocks are accumulated and given to the first new | |
| 529 // one whenever we keep a column, or at the end. | |
| 530 BLOCK_LIST completed_blocks; | |
| 531 TO_BLOCK_LIST to_blocks; | |
| 532 WorkingPartSet *first_new_set = nullptr; | |
| 533 WorkingPartSet *working_set = nullptr; | |
| 534 ColPartition_IT col_it(&parts_); | |
| 535 for (col_it.mark_cycle_pt(); !col_it.cycled_list(); col_it.forward()) { | |
| 536 ColPartition *column = col_it.data(); | |
| 537 // Any existing column to the left of column is completed. | |
| 538 while (!src_it.empty() && | |
| 539 ((working_set = src_it.data())->column() == nullptr || | |
| 540 working_set->column()->right_key() <= column->left_key())) { | |
| 541 src_it.extract(); | |
| 542 working_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts, | |
| 543 &completed_blocks, &to_blocks); | |
| 544 delete working_set; | |
| 545 src_it.forward(); | |
| 546 } | |
| 547 // Make a new between-column WorkingSet for before the current column. | |
| 548 working_set = new WorkingPartSet(nullptr); | |
| 549 dest_it.add_after_then_move(working_set); | |
| 550 if (first_new_set == nullptr) { | |
| 551 first_new_set = working_set; | |
| 552 } | |
| 553 // A matching column gets to stay, and first_new_set gets all the | |
| 554 // completed_sets. | |
| 555 working_set = src_it.empty() ? nullptr : src_it.data(); | |
| 556 if (working_set != nullptr && | |
| 557 working_set->column()->MatchingColumns(*column)) { | |
| 558 working_set->set_column(column); | |
| 559 dest_it.add_after_then_move(src_it.extract()); | |
| 560 src_it.forward(); | |
| 561 first_new_set->InsertCompletedBlocks(&completed_blocks, &to_blocks); | |
| 562 first_new_set = nullptr; | |
| 563 } else { | |
| 564 // Just make a new working set for the current column. | |
| 565 working_set = new WorkingPartSet(column); | |
| 566 dest_it.add_after_then_move(working_set); | |
| 567 } | |
| 568 } | |
| 569 // Complete any remaining src working sets. | |
| 570 while (!src_it.empty()) { | |
| 571 working_set = src_it.extract(); | |
| 572 working_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts, | |
| 573 &completed_blocks, &to_blocks); | |
| 574 delete working_set; | |
| 575 src_it.forward(); | |
| 576 } | |
| 577 // Make a new between-column WorkingSet for after the last column. | |
| 578 working_set = new WorkingPartSet(nullptr); | |
| 579 dest_it.add_after_then_move(working_set); | |
| 580 if (first_new_set == nullptr) { | |
| 581 first_new_set = working_set; | |
| 582 } | |
| 583 // The first_new_set now gets any accumulated completed_parts/blocks. | |
| 584 first_new_set->InsertCompletedBlocks(&completed_blocks, &to_blocks); | |
| 585 } | |
| 586 | |
| 587 // Accumulate the widths and gaps into the given variables. | |
| 588 void ColPartitionSet::AccumulateColumnWidthsAndGaps(int *total_width, | |
| 589 int *width_samples, | |
| 590 int *total_gap, | |
| 591 int *gap_samples) { | |
| 592 ColPartition_IT it(&parts_); | |
| 593 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 594 ColPartition *part = it.data(); | |
| 595 *total_width += part->ColumnWidth(); | |
| 596 ++*width_samples; | |
| 597 if (!it.at_last()) { | |
| 598 ColPartition *next_part = it.data_relative(1); | |
| 599 int part_left = part->right_key(); | |
| 600 int part_right = next_part->left_key(); | |
| 601 int gap = part->KeyWidth(part_left, part_right); | |
| 602 *total_gap += gap; | |
| 603 ++*gap_samples; | |
| 604 } | |
| 605 } | |
| 606 } | |
| 607 | |
| 608 // Provide debug output for this ColPartitionSet and all the ColPartitions. | |
| 609 void ColPartitionSet::Print() { | |
| 610 ColPartition_IT it(&parts_); | |
| 611 tprintf( | |
| 612 "Partition set of %d parts, %d good, coverage=%d+%d" | |
| 613 " (%d,%d)->(%d,%d)\n", | |
| 614 it.length(), good_column_count_, good_coverage_, bad_coverage_, | |
| 615 bounding_box_.left(), bounding_box_.bottom(), bounding_box_.right(), | |
| 616 bounding_box_.top()); | |
| 617 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 618 ColPartition *part = it.data(); | |
| 619 part->Print(); | |
| 620 } | |
| 621 } | |
| 622 | |
| 623 // PRIVATE CODE. | |
| 624 | |
| 625 // Add the given partition to the list in the appropriate place. | |
| 626 void ColPartitionSet::AddPartition(ColPartition *new_part, | |
| 627 ColPartition_IT *it) { | |
| 628 AddPartitionCoverageAndBox(*new_part); | |
| 629 int new_right = new_part->right_key(); | |
| 630 if (it->data()->left_key() >= new_right) { | |
| 631 it->add_before_stay_put(new_part); | |
| 632 } else { | |
| 633 it->add_after_stay_put(new_part); | |
| 634 } | |
| 635 } | |
| 636 | |
| 637 // Compute the coverage and good column count. Coverage is the amount of the | |
| 638 // width of the page (in pixels) that is covered by ColPartitions, which are | |
| 639 // used to provide candidate column layouts. | |
| 640 // Coverage is split into good and bad. Good coverage is provided by | |
| 641 // ColPartitions of a frequent width (according to the callback function | |
| 642 // provided by TabFinder::WidthCB, which accesses stored statistics on the | |
| 643 // widths of ColPartitions) and bad coverage is provided by all other | |
| 644 // ColPartitions, even if they have tab vectors at both sides. Thus: | |
| 645 // |-----------------------------------------------------------------| | |
| 646 // | Double width heading | | |
| 647 // |-----------------------------------------------------------------| | |
| 648 // |-------------------------------| |-------------------------------| | |
| 649 // | Common width ColPartition | | Common width ColPartition | | |
| 650 // |-------------------------------| |-------------------------------| | |
| 651 // the layout with two common-width columns has better coverage than the | |
| 652 // double width heading, because the coverage is "good," even though less in | |
| 653 // total coverage than the heading, because the heading coverage is "bad." | |
| 654 void ColPartitionSet::ComputeCoverage() { | |
| 655 // Count the number of good columns and sum their width. | |
| 656 ColPartition_IT it(&parts_); | |
| 657 good_column_count_ = 0; | |
| 658 good_coverage_ = 0; | |
| 659 bad_coverage_ = 0; | |
| 660 bounding_box_ = TBOX(); | |
| 661 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 662 ColPartition *part = it.data(); | |
| 663 AddPartitionCoverageAndBox(*part); | |
| 664 } | |
| 665 } | |
| 666 | |
| 667 // Adds the coverage, column count and box for a single partition, | |
| 668 // without adding it to the list. (Helper factored from ComputeCoverage.) | |
| 669 void ColPartitionSet::AddPartitionCoverageAndBox(const ColPartition &part) { | |
| 670 bounding_box_ += part.bounding_box(); | |
| 671 int coverage = part.ColumnWidth(); | |
| 672 if (part.good_width()) { | |
| 673 good_coverage_ += coverage; | |
| 674 good_column_count_ += 2; | |
| 675 } else { | |
| 676 if (part.blob_type() < BRT_UNKNOWN) { | |
| 677 coverage /= 2; | |
| 678 } | |
| 679 if (part.good_column()) { | |
| 680 ++good_column_count_; | |
| 681 } | |
| 682 bad_coverage_ += coverage; | |
| 683 } | |
| 684 } | |
| 685 | |
| 686 } // namespace tesseract. |
