comparison mupdf-source/thirdparty/tesseract/src/ccmain/pageiterator.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: pageiterator.cpp
3 // Description: Iterator for tesseract page structure that avoids using
4 // tesseract internal data structures.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2010, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19
20 #include <allheaders.h>
21 #include <tesseract/pageiterator.h>
22 #include "helpers.h"
23 #include "pageres.h"
24 #include "tesseractclass.h"
25
26 #include <algorithm>
27
28 namespace tesseract {
29
30 PageIterator::PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
31 int scaled_yres, int rect_left, int rect_top,
32 int rect_width, int rect_height)
33 : page_res_(page_res),
34 tesseract_(tesseract),
35 word_(nullptr),
36 word_length_(0),
37 blob_index_(0),
38 cblob_it_(nullptr),
39 include_upper_dots_(false),
40 include_lower_dots_(false),
41 scale_(scale),
42 scaled_yres_(scaled_yres),
43 rect_left_(rect_left),
44 rect_top_(rect_top),
45 rect_width_(rect_width),
46 rect_height_(rect_height) {
47 it_ = new PAGE_RES_IT(page_res);
48 PageIterator::Begin();
49 }
50
51 PageIterator::~PageIterator() {
52 delete it_;
53 delete cblob_it_;
54 }
55
56 /**
57 * PageIterators may be copied! This makes it possible to iterate over
58 * all the objects at a lower level, while maintaining an iterator to
59 * objects at a higher level.
60 */
61 PageIterator::PageIterator(const PageIterator &src)
62 : page_res_(src.page_res_),
63 tesseract_(src.tesseract_),
64 word_(nullptr),
65 word_length_(src.word_length_),
66 blob_index_(src.blob_index_),
67 cblob_it_(nullptr),
68 include_upper_dots_(src.include_upper_dots_),
69 include_lower_dots_(src.include_lower_dots_),
70 scale_(src.scale_),
71 scaled_yres_(src.scaled_yres_),
72 rect_left_(src.rect_left_),
73 rect_top_(src.rect_top_),
74 rect_width_(src.rect_width_),
75 rect_height_(src.rect_height_) {
76 it_ = new PAGE_RES_IT(*src.it_);
77 BeginWord(src.blob_index_);
78 }
79
80 const PageIterator &PageIterator::operator=(const PageIterator &src) {
81 page_res_ = src.page_res_;
82 tesseract_ = src.tesseract_;
83 include_upper_dots_ = src.include_upper_dots_;
84 include_lower_dots_ = src.include_lower_dots_;
85 scale_ = src.scale_;
86 scaled_yres_ = src.scaled_yres_;
87 rect_left_ = src.rect_left_;
88 rect_top_ = src.rect_top_;
89 rect_width_ = src.rect_width_;
90 rect_height_ = src.rect_height_;
91 delete it_;
92 it_ = new PAGE_RES_IT(*src.it_);
93 BeginWord(src.blob_index_);
94 return *this;
95 }
96
97 bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT *other) const {
98 return (it_ == nullptr && it_ == other) ||
99 ((other != nullptr) && (it_ != nullptr) && (*it_ == *other));
100 }
101
102 // ============= Moving around within the page ============.
103
104 /** Resets the iterator to point to the start of the page. */
105 void PageIterator::Begin() {
106 it_->restart_page_with_empties();
107 BeginWord(0);
108 }
109
110 void PageIterator::RestartParagraph() {
111 if (it_->block() == nullptr) {
112 return; // At end of the document.
113 }
114 PAGE_RES_IT para(page_res_);
115 PAGE_RES_IT next_para(para);
116 next_para.forward_paragraph();
117 while (next_para.cmp(*it_) <= 0) {
118 para = next_para;
119 next_para.forward_paragraph();
120 }
121 *it_ = para;
122 BeginWord(0);
123 }
124
125 bool PageIterator::IsWithinFirstTextlineOfParagraph() const {
126 PageIterator p_start(*this);
127 p_start.RestartParagraph();
128 return p_start.it_->row() == it_->row();
129 }
130
131 void PageIterator::RestartRow() {
132 it_->restart_row();
133 BeginWord(0);
134 }
135
136 /**
137 * Moves to the start of the next object at the given level in the
138 * page hierarchy, and returns false if the end of the page was reached.
139 * NOTE (CHANGED!) that ALL PageIteratorLevel level values will visit each
140 * non-text block at least once.
141 * Think of non text blocks as containing a single para, with at least one
142 * line, with a single imaginary word, containing a single symbol.
143 * The bounding boxes mark out any polygonal nature of the block, and
144 * PTIsTextType(BLockType()) is false for non-text blocks.
145 * Calls to Next with different levels may be freely intermixed.
146 * This function iterates words in right-to-left scripts correctly, if
147 * the appropriate language has been loaded into Tesseract.
148 */
149 bool PageIterator::Next(PageIteratorLevel level) {
150 if (it_->block() == nullptr) {
151 return false; // Already at the end!
152 }
153 if (it_->word() == nullptr) {
154 level = RIL_BLOCK;
155 }
156
157 switch (level) {
158 case RIL_BLOCK:
159 it_->forward_block();
160 break;
161 case RIL_PARA:
162 it_->forward_paragraph();
163 break;
164 case RIL_TEXTLINE:
165 for (it_->forward_with_empties(); it_->row() == it_->prev_row();
166 it_->forward_with_empties()) {
167 ;
168 }
169 break;
170 case RIL_WORD:
171 it_->forward_with_empties();
172 break;
173 case RIL_SYMBOL:
174 if (cblob_it_ != nullptr) {
175 cblob_it_->forward();
176 }
177 ++blob_index_;
178 if (blob_index_ >= word_length_) {
179 it_->forward_with_empties();
180 } else {
181 return true;
182 }
183 break;
184 }
185 BeginWord(0);
186 return it_->block() != nullptr;
187 }
188
189 /**
190 * Returns true if the iterator is at the start of an object at the given
191 * level. Possible uses include determining if a call to Next(RIL_WORD)
192 * moved to the start of a RIL_PARA.
193 */
194 bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
195 if (it_->block() == nullptr) {
196 return false; // Already at the end!
197 }
198 if (it_->word() == nullptr) {
199 return true; // In an image block.
200 }
201 switch (level) {
202 case RIL_BLOCK:
203 return blob_index_ == 0 && it_->block() != it_->prev_block();
204 case RIL_PARA:
205 return blob_index_ == 0 &&
206 (it_->block() != it_->prev_block() ||
207 it_->row()->row->para() != it_->prev_row()->row->para());
208 case RIL_TEXTLINE:
209 return blob_index_ == 0 && it_->row() != it_->prev_row();
210 case RIL_WORD:
211 return blob_index_ == 0;
212 case RIL_SYMBOL:
213 return true;
214 }
215 return false;
216 }
217
218 /**
219 * Returns whether the iterator is positioned at the last element in a
220 * given level. (e.g. the last word in a line, the last line in a block)
221 */
222 bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
223 PageIteratorLevel element) const {
224 if (Empty(element)) {
225 return true; // Already at the end!
226 }
227 // The result is true if we step forward by element and find we are
228 // at the end of the page or at beginning of *all* levels in:
229 // [level, element).
230 // When there is more than one level difference between element and level,
231 // we could for instance move forward one symbol and still be at the first
232 // word on a line, so we also have to be at the first symbol in a word.
233 PageIterator next(*this);
234 next.Next(element);
235 if (next.Empty(element)) {
236 return true; // Reached the end of the page.
237 }
238 while (element > level) {
239 element = static_cast<PageIteratorLevel>(element - 1);
240 if (!next.IsAtBeginningOf(element)) {
241 return false;
242 }
243 }
244 return true;
245 }
246
247 /**
248 * Returns whether this iterator is positioned
249 * before other: -1
250 * equal to other: 0
251 * after other: 1
252 */
253 int PageIterator::Cmp(const PageIterator &other) const {
254 int word_cmp = it_->cmp(*other.it_);
255 if (word_cmp != 0) {
256 return word_cmp;
257 }
258 if (blob_index_ < other.blob_index_) {
259 return -1;
260 }
261 if (blob_index_ == other.blob_index_) {
262 return 0;
263 }
264 return 1;
265 }
266
267 // ============= Accessing data ==============.
268 // Coordinate system:
269 // Integer coordinates are at the cracks between the pixels.
270 // The top-left corner of the top-left pixel in the image is at (0,0).
271 // The bottom-right corner of the bottom-right pixel in the image is at
272 // (width, height).
273 // Every bounding box goes from the top-left of the top-left contained
274 // pixel to the bottom-right of the bottom-right contained pixel, so
275 // the bounding box of the single top-left pixel in the image is:
276 // (0,0)->(1,1).
277 // If an image rectangle has been set in the API, then returned coordinates
278 // relate to the original (full) image, rather than the rectangle.
279
280 /**
281 * Returns the bounding rectangle of the current object at the given level in
282 * the coordinates of the working image that is pix_binary().
283 * See comment on coordinate system above.
284 * Returns false if there is no such object at the current position.
285 */
286 bool PageIterator::BoundingBoxInternal(PageIteratorLevel level, int *left,
287 int *top, int *right,
288 int *bottom) const {
289 if (Empty(level)) {
290 return false;
291 }
292 TBOX box;
293 PARA *para = nullptr;
294 switch (level) {
295 case RIL_BLOCK:
296 box = it_->block()->block->restricted_bounding_box(include_upper_dots_,
297 include_lower_dots_);
298 break;
299 case RIL_PARA:
300 para = it_->row()->row->para();
301 // Fall through.
302 case RIL_TEXTLINE:
303 box = it_->row()->row->restricted_bounding_box(include_upper_dots_,
304 include_lower_dots_);
305 break;
306 case RIL_WORD:
307 box = it_->word()->word->restricted_bounding_box(include_upper_dots_,
308 include_lower_dots_);
309 break;
310 case RIL_SYMBOL:
311 if (cblob_it_ == nullptr) {
312 box = it_->word()->box_word->BlobBox(blob_index_);
313 } else {
314 box = cblob_it_->data()->bounding_box();
315 }
316 }
317 if (level == RIL_PARA) {
318 PageIterator other = *this;
319 other.Begin();
320 do {
321 if (other.it_->block() &&
322 other.it_->block()->block == it_->block()->block &&
323 other.it_->row() && other.it_->row()->row &&
324 other.it_->row()->row->para() == para) {
325 box = box.bounding_union(other.it_->row()->row->bounding_box());
326 }
327 } while (other.Next(RIL_TEXTLINE));
328 }
329 if (level != RIL_SYMBOL || cblob_it_ != nullptr) {
330 box.rotate(it_->block()->block->re_rotation());
331 }
332 // Now we have a box in tesseract coordinates relative to the image rectangle,
333 // we have to convert the coords to a top-down system.
334 const int pix_height = pixGetHeight(tesseract_->pix_binary());
335 const int pix_width = pixGetWidth(tesseract_->pix_binary());
336 *left = ClipToRange(static_cast<int>(box.left()), 0, pix_width);
337 *top = ClipToRange(pix_height - box.top(), 0, pix_height);
338 *right = ClipToRange(static_cast<int>(box.right()), *left, pix_width);
339 *bottom = ClipToRange(pix_height - box.bottom(), *top, pix_height);
340 return true;
341 }
342
343 /**
344 * Returns the bounding rectangle of the current object at the given level in
345 * coordinates of the original image.
346 * See comment on coordinate system above.
347 * Returns false if there is no such object at the current position.
348 */
349 bool PageIterator::BoundingBox(PageIteratorLevel level, int *left, int *top,
350 int *right, int *bottom) const {
351 return BoundingBox(level, 0, left, top, right, bottom);
352 }
353
354 bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding,
355 int *left, int *top, int *right,
356 int *bottom) const {
357 if (!BoundingBoxInternal(level, left, top, right, bottom)) {
358 return false;
359 }
360 // Convert to the coordinate system of the original image.
361 *left = ClipToRange(*left / scale_ + rect_left_ - padding, rect_left_,
362 rect_left_ + rect_width_);
363 *top = ClipToRange(*top / scale_ + rect_top_ - padding, rect_top_,
364 rect_top_ + rect_height_);
365 *right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding,
366 *left, rect_left_ + rect_width_);
367 *bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding,
368 *top, rect_top_ + rect_height_);
369 return true;
370 }
371
372 /** Return that there is no such object at a given level. */
373 bool PageIterator::Empty(PageIteratorLevel level) const {
374 if (it_->block() == nullptr) {
375 return true; // Already at the end!
376 }
377 if (it_->word() == nullptr && level != RIL_BLOCK) {
378 return true; // image block
379 }
380 if (level == RIL_SYMBOL && blob_index_ >= word_length_) {
381 return true; // Zero length word, or already at the end of it.
382 }
383 return false;
384 }
385
386 /** Returns the type of the current block.
387 * See tesseract/publictypes.h for PolyBlockType. */
388 PolyBlockType PageIterator::BlockType() const {
389 if (it_->block() == nullptr || it_->block()->block == nullptr) {
390 return PT_UNKNOWN; // Already at the end!
391 }
392 if (it_->block()->block->pdblk.poly_block() == nullptr) {
393 return PT_FLOWING_TEXT; // No layout analysis used - assume text.
394 }
395 return it_->block()->block->pdblk.poly_block()->isA();
396 }
397
398 /** Returns the polygon outline of the current block. The returned Pta must
399 * be ptaDestroy-ed after use. */
400 Pta *PageIterator::BlockPolygon() const {
401 if (it_->block() == nullptr || it_->block()->block == nullptr) {
402 return nullptr; // Already at the end!
403 }
404 if (it_->block()->block->pdblk.poly_block() == nullptr) {
405 return nullptr; // No layout analysis used - no polygon.
406 }
407 // Copy polygon, so we can unrotate it to image coordinates.
408 POLY_BLOCK *internal_poly = it_->block()->block->pdblk.poly_block();
409 ICOORDELT_LIST vertices;
410 vertices.deep_copy(internal_poly->points(), ICOORDELT::deep_copy);
411 POLY_BLOCK poly(&vertices, internal_poly->isA());
412 poly.rotate(it_->block()->block->re_rotation());
413 ICOORDELT_IT it(poly.points());
414 Pta *pta = ptaCreate(it.length());
415 int num_pts = 0;
416 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++num_pts) {
417 ICOORD *pt = it.data();
418 // Convert to top-down coords within the input image.
419 int x = static_cast<float>(pt->x()) / scale_ + rect_left_;
420 int y = rect_top_ + rect_height_ - static_cast<float>(pt->y()) / scale_;
421 x = ClipToRange(x, rect_left_, rect_left_ + rect_width_);
422 y = ClipToRange(y, rect_top_, rect_top_ + rect_height_);
423 ptaAddPt(pta, x, y);
424 }
425 return pta;
426 }
427
428 /**
429 * Returns a binary image of the current object at the given level.
430 * The position and size match the return from BoundingBoxInternal, and so this
431 * could be upscaled with respect to the original input image.
432 * Use pixDestroy to delete the image after use.
433 * The following methods are used to generate the images:
434 * RIL_BLOCK: mask the page image with the block polygon.
435 * RIL_TEXTLINE: Clip the rectangle of the line box from the page image.
436 * TODO(rays) fix this to generate and use a line polygon.
437 * RIL_WORD: Clip the rectangle of the word box from the page image.
438 * RIL_SYMBOL: Render the symbol outline to an image for cblobs (prior
439 * to recognition) or the bounding box otherwise.
440 * A reconstruction of the original image (using xor to check for double
441 * representation) should be reasonably accurate,
442 * apart from removed noise, at the block level. Below the block level, the
443 * reconstruction will be missing images and line separators.
444 * At the symbol level, kerned characters will be invade the bounding box
445 * if rendered after recognition, making an xor reconstruction inaccurate, but
446 * an or construction better. Before recognition, symbol-level reconstruction
447 * should be good, even with xor, since the images come from the connected
448 * components.
449 */
450 Pix *PageIterator::GetBinaryImage(PageIteratorLevel level) const {
451 int left, top, right, bottom;
452 if (!BoundingBoxInternal(level, &left, &top, &right, &bottom)) {
453 return nullptr;
454 }
455 if (level == RIL_SYMBOL && cblob_it_ != nullptr &&
456 cblob_it_->data()->area() != 0) {
457 return cblob_it_->data()->render();
458 }
459 Box *box = boxCreate(left, top, right - left, bottom - top);
460 Image pix = pixClipRectangle(tesseract_->pix_binary(), box, nullptr);
461 boxDestroy(&box);
462 if (level == RIL_BLOCK || level == RIL_PARA) {
463 // Clip to the block polygon as well.
464 TBOX mask_box;
465 Image mask = it_->block()->block->render_mask(&mask_box);
466 int mask_x = left - mask_box.left();
467 int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
468 // AND the mask and pix, putting the result in pix.
469 pixRasterop(pix, std::max(0, -mask_x), std::max(0, -mask_y),
470 pixGetWidth(pix), pixGetHeight(pix), PIX_SRC & PIX_DST, mask,
471 std::max(0, mask_x), std::max(0, mask_y));
472 mask.destroy();
473 }
474 return pix;
475 }
476
477 /**
478 * Returns an image of the current object at the given level in greyscale
479 * if available in the input. To guarantee a binary image use BinaryImage.
480 * NOTE that in order to give the best possible image, the bounds are
481 * expanded slightly over the binary connected component, by the supplied
482 * padding, so the top-left position of the returned image is returned
483 * in (left,top). These will most likely not match the coordinates
484 * returned by BoundingBox.
485 * If you do not supply an original image, you will get a binary one.
486 * Use pixDestroy to delete the image after use.
487 */
488 Pix *PageIterator::GetImage(PageIteratorLevel level, int padding,
489 Pix *original_img, int *left, int *top) const {
490 int right, bottom;
491 if (!BoundingBox(level, left, top, &right, &bottom)) {
492 return nullptr;
493 }
494 if (original_img == nullptr) {
495 return GetBinaryImage(level);
496 }
497
498 // Expand the box.
499 *left = std::max(*left - padding, 0);
500 *top = std::max(*top - padding, 0);
501 right = std::min(right + padding, rect_width_);
502 bottom = std::min(bottom + padding, rect_height_);
503 Box *box = boxCreate(*left, *top, right - *left, bottom - *top);
504 Image grey_pix = pixClipRectangle(original_img, box, nullptr);
505 boxDestroy(&box);
506 if (level == RIL_BLOCK || level == RIL_PARA) {
507 // Clip to the block polygon as well.
508 TBOX mask_box;
509 Image mask = it_->block()->block->render_mask(&mask_box);
510 // Copy the mask registered correctly into an image the size of grey_pix.
511 int mask_x = *left - mask_box.left();
512 int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
513 int width = pixGetWidth(grey_pix);
514 int height = pixGetHeight(grey_pix);
515 Image resized_mask = pixCreate(width, height, 1);
516 pixRasterop(resized_mask, std::max(0, -mask_x), std::max(0, -mask_y), width,
517 height, PIX_SRC, mask, std::max(0, mask_x),
518 std::max(0, mask_y));
519 mask.destroy();
520 pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,
521 2 * padding + 1);
522 pixInvert(resized_mask, resized_mask);
523 pixSetMasked(grey_pix, resized_mask, UINT32_MAX);
524 resized_mask.destroy();
525 }
526 return grey_pix;
527 }
528
529 /**
530 * Returns the baseline of the current object at the given level.
531 * The baseline is the line that passes through (x1, y1) and (x2, y2).
532 * WARNING: with vertical text, baselines may be vertical!
533 */
534 bool PageIterator::Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,
535 int *y2) const {
536 if (it_->word() == nullptr) {
537 return false; // Already at the end!
538 }
539 ROW *row = it_->row()->row;
540 WERD *word = it_->word()->word;
541 TBOX box = (level == RIL_WORD || level == RIL_SYMBOL) ? word->bounding_box()
542 : row->bounding_box();
543 int left = box.left();
544 ICOORD startpt(left, static_cast<int16_t>(row->base_line(left) + 0.5));
545 int right = box.right();
546 ICOORD endpt(right, static_cast<int16_t>(row->base_line(right) + 0.5));
547 // Rotate to image coordinates and convert to global image coords.
548 startpt.rotate(it_->block()->block->re_rotation());
549 endpt.rotate(it_->block()->block->re_rotation());
550 *x1 = startpt.x() / scale_ + rect_left_;
551 *y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_;
552 *x2 = endpt.x() / scale_ + rect_left_;
553 *y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_;
554 return true;
555 }
556
557 void PageIterator::RowAttributes(float *row_height, float *descenders,
558 float *ascenders) const {
559 *row_height = it_->row()->row->x_height() + it_->row()->row->ascenders() -
560 it_->row()->row->descenders();
561 *descenders = it_->row()->row->descenders();
562 *ascenders = it_->row()->row->ascenders();
563 }
564
565 void PageIterator::Orientation(tesseract::Orientation *orientation,
566 tesseract::WritingDirection *writing_direction,
567 tesseract::TextlineOrder *textline_order,
568 float *deskew_angle) const {
569 auto *block_res = it_->block();
570 if (block_res == nullptr) {
571 // Nothing can be done, so return default values.
572 *orientation = ORIENTATION_PAGE_UP;
573 *writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
574 *textline_order = TEXTLINE_ORDER_TOP_TO_BOTTOM;
575 return;
576 }
577 auto *block = block_res->block;
578
579 // Orientation
580 FCOORD up_in_image(0.0, 1.0);
581 up_in_image.unrotate(block->classify_rotation());
582 up_in_image.rotate(block->re_rotation());
583
584 if (up_in_image.x() == 0.0F) {
585 if (up_in_image.y() > 0.0F) {
586 *orientation = ORIENTATION_PAGE_UP;
587 } else {
588 *orientation = ORIENTATION_PAGE_DOWN;
589 }
590 } else if (up_in_image.x() > 0.0F) {
591 *orientation = ORIENTATION_PAGE_RIGHT;
592 } else {
593 *orientation = ORIENTATION_PAGE_LEFT;
594 }
595
596 // Writing direction
597 bool is_vertical_text = (block->classify_rotation().x() == 0.0);
598 bool right_to_left = block->right_to_left();
599 *writing_direction = is_vertical_text
600 ? WRITING_DIRECTION_TOP_TO_BOTTOM
601 : (right_to_left ? WRITING_DIRECTION_RIGHT_TO_LEFT
602 : WRITING_DIRECTION_LEFT_TO_RIGHT);
603
604 // Textline Order
605 const bool is_mongolian = false; // TODO(eger): fix me
606 *textline_order = is_vertical_text
607 ? (is_mongolian ? TEXTLINE_ORDER_LEFT_TO_RIGHT
608 : TEXTLINE_ORDER_RIGHT_TO_LEFT)
609 : TEXTLINE_ORDER_TOP_TO_BOTTOM;
610
611 // Deskew angle
612 FCOORD skew = block->skew(); // true horizontal for textlines
613 *deskew_angle = -skew.angle();
614 }
615
616 void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just,
617 bool *is_list_item, bool *is_crown,
618 int *first_line_indent) const {
619 *just = tesseract::JUSTIFICATION_UNKNOWN;
620 if (!it_->row() || !it_->row()->row || !it_->row()->row->para() ||
621 !it_->row()->row->para()->model) {
622 return;
623 }
624
625 PARA *para = it_->row()->row->para();
626 *is_list_item = para->is_list_item;
627 *is_crown = para->is_very_first_or_continuation;
628 *first_line_indent = para->model->first_indent() - para->model->body_indent();
629 *just = para->model->justification();
630 }
631
632 /**
633 * Sets up the internal data for iterating the blobs of a new word, then
634 * moves the iterator to the given offset.
635 */
636 void PageIterator::BeginWord(int offset) {
637 WERD_RES *word_res = it_->word();
638 if (word_res == nullptr) {
639 // This is a non-text block, so there is no word.
640 word_length_ = 0;
641 blob_index_ = 0;
642 word_ = nullptr;
643 return;
644 }
645 if (word_res->best_choice != nullptr) {
646 // Recognition has been done, so we are using the box_word, which
647 // is already baseline denormalized.
648 word_length_ = word_res->best_choice->length();
649 if (word_res->box_word != nullptr) {
650 if (word_res->box_word->length() != static_cast<unsigned>(word_length_)) {
651 tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
652 word_length_, word_res->best_choice->unichar_string().c_str(),
653 word_res->box_word->length());
654 word_res->box_word->bounding_box().print();
655 }
656 ASSERT_HOST(word_res->box_word->length() ==
657 static_cast<unsigned>(word_length_));
658 }
659 word_ = nullptr;
660 // We will be iterating the box_word.
661 delete cblob_it_;
662 cblob_it_ = nullptr;
663 } else {
664 // No recognition yet, so a "symbol" is a cblob.
665 word_ = word_res->word;
666 ASSERT_HOST(word_->cblob_list() != nullptr);
667 word_length_ = word_->cblob_list()->length();
668 if (cblob_it_ == nullptr) {
669 cblob_it_ = new C_BLOB_IT;
670 }
671 cblob_it_->set_to_list(word_->cblob_list());
672 }
673 for (blob_index_ = 0; blob_index_ < offset; ++blob_index_) {
674 if (cblob_it_ != nullptr) {
675 cblob_it_->forward();
676 }
677 }
678 }
679
680 bool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {
681 if (it_->word() != nullptr) {
682 it_->word()->blamer_bundle = blamer_bundle;
683 return true;
684 } else {
685 return false;
686 }
687 }
688
689 } // namespace tesseract.