comparison mupdf-source/thirdparty/tesseract/src/ccstruct/ocrblock.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: ocrblock.cpp (Formerly block.c)
3 * Description: BLOCK member functions and iterator functions.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1991, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #include "ocrblock.h"
20
21 #include "stepblob.h"
22 #include "tprintf.h"
23
24 #include <cstdlib>
25 #include <memory> // std::unique_ptr
26
27 namespace tesseract {
28
29 /**
30 * BLOCK::BLOCK
31 *
32 * Constructor for a simple rectangular block.
33 */
34 BLOCK::BLOCK(const char *name, ///< filename
35 bool prop, ///< proportional
36 int16_t kern, ///< kerning
37 int16_t space, ///< spacing
38 TDimension xmin, ///< bottom left
39 TDimension ymin,
40 TDimension xmax, ///< top right
41 TDimension ymax)
42 : pdblk(xmin, ymin, xmax, ymax)
43 , filename(name)
44 , re_rotation_(1.0f, 0.0f)
45 , classify_rotation_(1.0f, 0.0f)
46 , skew_(1.0f, 0.0f) {
47 ICOORDELT_IT left_it = &pdblk.leftside;
48 ICOORDELT_IT right_it = &pdblk.rightside;
49
50 proportional = prop;
51 kerning = kern;
52 spacing = space;
53 font_class = -1; // not assigned
54 cell_over_xheight_ = 2.0f;
55 pdblk.hand_poly = nullptr;
56 left_it.set_to_list(&pdblk.leftside);
57 right_it.set_to_list(&pdblk.rightside);
58 // make default box
59 left_it.add_to_end(new ICOORDELT(xmin, ymin));
60 left_it.add_to_end(new ICOORDELT(xmin, ymax));
61 right_it.add_to_end(new ICOORDELT(xmax, ymin));
62 right_it.add_to_end(new ICOORDELT(xmax, ymax));
63 }
64
65 /**
66 * decreasing_top_order
67 *
68 * Sort Comparator: Return <0 if row1 top < row2 top
69 */
70
71 static int decreasing_top_order(const void *row1, const void *row2) {
72 return (*reinterpret_cast<ROW *const *>(row2))->bounding_box().top() -
73 (*reinterpret_cast<ROW *const *>(row1))->bounding_box().top();
74 }
75
76 /**
77 * BLOCK::rotate
78 *
79 * Rotate the polygon by the given rotation and recompute the bounding_box.
80 */
81 void BLOCK::rotate(const FCOORD &rotation) {
82 pdblk.poly_block()->rotate(rotation);
83 pdblk.box = *pdblk.poly_block()->bounding_box();
84 }
85
86 // Returns the bounding box including the desired combination of upper and
87 // lower noise/diacritic elements.
88 TBOX BLOCK::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
89 TBOX box;
90 // This is a read-only iteration of the rows in the block.
91 ROW_IT it(const_cast<ROW_LIST *>(&rows));
92 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
93 box += it.data()->restricted_bounding_box(upper_dots, lower_dots);
94 }
95 return box;
96 }
97
98 /**
99 * BLOCK::reflect_polygon_in_y_axis
100 *
101 * Reflects the polygon in the y-axis and recompute the bounding_box.
102 * Does nothing to any contained rows/words/blobs etc.
103 */
104 void BLOCK::reflect_polygon_in_y_axis() {
105 pdblk.poly_block()->reflect_in_y_axis();
106 pdblk.box = *pdblk.poly_block()->bounding_box();
107 }
108
109 /**
110 * BLOCK::sort_rows
111 *
112 * Order rows so that they are in order of decreasing Y coordinate
113 */
114
115 void BLOCK::sort_rows() { // order on "top"
116 ROW_IT row_it(&rows);
117
118 row_it.sort(decreasing_top_order);
119 }
120
121 /**
122 * BLOCK::compress
123 *
124 * Delete space between the rows. (And maybe one day, compress the rows)
125 * Fill space of block from top down, left aligning rows.
126 */
127
128 void BLOCK::compress() { // squash it up
129 #define ROW_SPACING 5
130
131 ROW_IT row_it(&rows);
132 ROW *row;
133 ICOORD row_spacing(0, ROW_SPACING);
134
135 ICOORDELT_IT icoordelt_it;
136
137 sort_rows();
138
139 pdblk.box = TBOX(pdblk.box.topleft(), pdblk.box.topleft());
140 pdblk.box.move_bottom_edge(ROW_SPACING);
141 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
142 row = row_it.data();
143 row->move(pdblk.box.botleft() - row_spacing - row->bounding_box().topleft());
144 pdblk.box += row->bounding_box();
145 }
146
147 pdblk.leftside.clear();
148 icoordelt_it.set_to_list(&pdblk.leftside);
149 icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.left(), pdblk.box.bottom()));
150 icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.left(), pdblk.box.top()));
151 pdblk.rightside.clear();
152 icoordelt_it.set_to_list(&pdblk.rightside);
153 icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.right(), pdblk.box.bottom()));
154 icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.right(), pdblk.box.top()));
155 }
156
157 /**
158 * BLOCK::check_pitch
159 *
160 * Check whether the block is fixed or prop, set the flag, and set
161 * the pitch if it is fixed.
162 */
163
164 void BLOCK::check_pitch() { // check prop
165 // tprintf("Missing FFT fixed pitch stuff!\n");
166 pitch = -1;
167 }
168
169 /**
170 * BLOCK::compress
171 *
172 * Compress and move in a single operation.
173 */
174
175 void BLOCK::compress( // squash it up
176 const ICOORD vec // and move
177 ) {
178 pdblk.box.move(vec);
179 compress();
180 }
181
182 /**
183 * BLOCK::print
184 *
185 * Print the info on a block
186 */
187
188 void BLOCK::print( // print list of sides
189 FILE *, ///< file to print on
190 bool dump ///< print full detail
191 ) {
192 ICOORDELT_IT it = &pdblk.leftside; // iterator
193
194 pdblk.box.print();
195 tprintf("Proportional= %s\n", proportional ? "TRUE" : "FALSE");
196 tprintf("Kerning= %d\n", kerning);
197 tprintf("Spacing= %d\n", spacing);
198 tprintf("Fixed_pitch=%d\n", pitch);
199 tprintf("Filename= %s\n", filename.c_str());
200
201 if (dump) {
202 tprintf("Left side coords are:\n");
203 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
204 tprintf("(%d,%d) ", it.data()->x(), it.data()->y());
205 }
206 tprintf("\n");
207 tprintf("Right side coords are:\n");
208 it.set_to_list(&pdblk.rightside);
209 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
210 tprintf("(%d,%d) ", it.data()->x(), it.data()->y());
211 }
212 tprintf("\n");
213 }
214 }
215
216 /**
217 * BLOCK::operator=
218 *
219 * Assignment - duplicate the block structure, but with an EMPTY row list.
220 */
221
222 BLOCK &BLOCK::operator=( // assignment
223 const BLOCK &source // from this
224 ) {
225 this->ELIST_LINK::operator=(source);
226 pdblk = source.pdblk;
227 proportional = source.proportional;
228 kerning = source.kerning;
229 spacing = source.spacing;
230 filename = source.filename; // STRINGs assign ok
231 if (!rows.empty()) {
232 rows.clear();
233 }
234 re_rotation_ = source.re_rotation_;
235 classify_rotation_ = source.classify_rotation_;
236 skew_ = source.skew_;
237 return *this;
238 }
239
240 // This function is for finding the approximate (horizontal) distance from
241 // the x-coordinate of the left edge of a symbol to the left edge of the
242 // text block which contains it. We are passed:
243 // segments - output of PB_LINE_IT::get_line() which contains x-coordinate
244 // intervals for the scan line going through the symbol's y-coordinate.
245 // Each element of segments is of the form (x()=start_x, y()=length).
246 // x - the x coordinate of the symbol we're interested in.
247 // margin - return value, the distance from x,y to the left margin of the
248 // block containing it.
249 // If all segments were to the right of x, we return false and 0.
250 static bool LeftMargin(ICOORDELT_LIST *segments, int x, int *margin) {
251 bool found = false;
252 *margin = 0;
253 if (segments->empty()) {
254 return found;
255 }
256 ICOORDELT_IT seg_it(segments);
257 for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
258 int cur_margin = x - seg_it.data()->x();
259 if (cur_margin >= 0) {
260 if (!found) {
261 *margin = cur_margin;
262 } else if (cur_margin < *margin) {
263 *margin = cur_margin;
264 }
265 found = true;
266 }
267 }
268 return found;
269 }
270
271 // This function is for finding the approximate (horizontal) distance from
272 // the x-coordinate of the right edge of a symbol to the right edge of the
273 // text block which contains it. We are passed:
274 // segments - output of PB_LINE_IT::get_line() which contains x-coordinate
275 // intervals for the scan line going through the symbol's y-coordinate.
276 // Each element of segments is of the form (x()=start_x, y()=length).
277 // x - the x coordinate of the symbol we're interested in.
278 // margin - return value, the distance from x,y to the right margin of the
279 // block containing it.
280 // If all segments were to the left of x, we return false and 0.
281 static bool RightMargin(ICOORDELT_LIST *segments, int x, int *margin) {
282 bool found = false;
283 *margin = 0;
284 if (segments->empty()) {
285 return found;
286 }
287 ICOORDELT_IT seg_it(segments);
288 for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
289 int cur_margin = seg_it.data()->x() + seg_it.data()->y() - x;
290 if (cur_margin >= 0) {
291 if (!found) {
292 *margin = cur_margin;
293 } else if (cur_margin < *margin) {
294 *margin = cur_margin;
295 }
296 found = true;
297 }
298 }
299 return found;
300 }
301
302 // Compute the distance from the left and right ends of each row to the
303 // left and right edges of the block's polyblock. Illustration:
304 // ____________________________ _______________________
305 // | Howdy neighbor! | |rectangular blocks look|
306 // | This text is written to| |more like stacked pizza|
307 // |illustrate how useful poly- |boxes. |
308 // |blobs are in ----------- ------ The polyblob|
309 // |dealing with| _________ |for a BLOCK rec-|
310 // |harder layout| /===========\ |ords the possibly|
311 // |issues. | | _ _ | |skewed pseudo-|
312 // | You see this| | |_| \|_| | |rectangular |
313 // |text is flowed| | } | |boundary that|
314 // |around a mid-| \ ____ | |forms the ideal-|
315 // |column portrait._____ \ / __|ized text margin|
316 // | Polyblobs exist| \ / |from which we should|
317 // |to account for insets| | | |measure paragraph|
318 // |which make otherwise| ----- |indentation. |
319 // ----------------------- ----------------------
320 //
321 // If we identify a drop-cap, we measure the left margin for the lines
322 // below the first line relative to one space past the drop cap. The
323 // first line's margin and those past the drop cap area are measured
324 // relative to the enclosing polyblock.
325 //
326 // TODO(rays): Before this will work well, we'll need to adjust the
327 // polyblob tighter around the text near images, as in:
328 // UNLV_AUTO:mag.3G0 page 2
329 // UNLV_AUTO:mag.3G4 page 16
330 void BLOCK::compute_row_margins() {
331 if (row_list()->empty() || row_list()->singleton()) {
332 return;
333 }
334
335 // If Layout analysis was not called, default to this.
336 POLY_BLOCK rect_block(pdblk.bounding_box(), PT_FLOWING_TEXT);
337 POLY_BLOCK *pblock = &rect_block;
338 if (pdblk.poly_block() != nullptr) {
339 pblock = pdblk.poly_block();
340 }
341
342 // Step One: Determine if there is a drop-cap.
343 // TODO(eger): Fix up drop cap code for RTL languages.
344 ROW_IT r_it(row_list());
345 ROW *first_row = r_it.data();
346 ROW *second_row = r_it.data_relative(1);
347
348 // initialize the bottom of a fictitious drop cap far above the first line.
349 int drop_cap_bottom = first_row->bounding_box().top() + first_row->bounding_box().height();
350 int drop_cap_right = first_row->bounding_box().left();
351 int mid_second_line = second_row->bounding_box().top() - second_row->bounding_box().height() / 2;
352 WERD_IT werd_it(r_it.data()->word_list()); // words of line one
353 if (!werd_it.empty()) {
354 C_BLOB_IT cblob_it(werd_it.data()->cblob_list());
355 for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) {
356 TBOX bbox = cblob_it.data()->bounding_box();
357 if (bbox.bottom() <= mid_second_line) {
358 // we found a real drop cap
359 first_row->set_has_drop_cap(true);
360 if (drop_cap_bottom > bbox.bottom()) {
361 drop_cap_bottom = bbox.bottom();
362 }
363 if (drop_cap_right < bbox.right()) {
364 drop_cap_right = bbox.right();
365 }
366 }
367 }
368 }
369
370 // Step Two: Calculate the margin from the text of each row to the block
371 // (or drop-cap) boundaries.
372 PB_LINE_IT lines(pblock);
373 r_it.set_to_list(row_list());
374 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
375 ROW *row = r_it.data();
376 TBOX row_box = row->bounding_box();
377 int left_y = row->base_line(row_box.left()) + row->x_height();
378 int left_margin;
379 const std::unique_ptr</*non-const*/ ICOORDELT_LIST> segments_left(lines.get_line(left_y));
380 LeftMargin(segments_left.get(), row_box.left(), &left_margin);
381
382 if (row_box.top() >= drop_cap_bottom) {
383 int drop_cap_distance = row_box.left() - row->space() - drop_cap_right;
384 if (drop_cap_distance < 0) {
385 drop_cap_distance = 0;
386 }
387 if (drop_cap_distance < left_margin) {
388 left_margin = drop_cap_distance;
389 }
390 }
391
392 int right_y = row->base_line(row_box.right()) + row->x_height();
393 int right_margin;
394 const std::unique_ptr</*non-const*/ ICOORDELT_LIST> segments_right(lines.get_line(right_y));
395 RightMargin(segments_right.get(), row_box.right(), &right_margin);
396 row->set_lmargin(left_margin);
397 row->set_rmargin(right_margin);
398 }
399 }
400
401 /**********************************************************************
402 * PrintSegmentationStats
403 *
404 * Prints segmentation stats for the given block list.
405 **********************************************************************/
406
407 void PrintSegmentationStats(BLOCK_LIST *block_list) {
408 int num_blocks = 0;
409 int num_rows = 0;
410 int num_words = 0;
411 int num_blobs = 0;
412 BLOCK_IT block_it(block_list);
413 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
414 BLOCK *block = block_it.data();
415 ++num_blocks;
416 ROW_IT row_it(block->row_list());
417 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
418 ++num_rows;
419 ROW *row = row_it.data();
420 // Iterate over all werds in the row.
421 WERD_IT werd_it(row->word_list());
422 for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) {
423 WERD *werd = werd_it.data();
424 ++num_words;
425 num_blobs += werd->cblob_list()->length();
426 }
427 }
428 }
429 tprintf("Block list stats:\nBlocks = %d\nRows = %d\nWords = %d\nBlobs = %d\n", num_blocks,
430 num_rows, num_words, num_blobs);
431 }
432
433 /**********************************************************************
434 * ExtractBlobsFromSegmentation
435 *
436 * Extracts blobs from the given block list and adds them to the output list.
437 * The block list must have been created by performing a page segmentation.
438 **********************************************************************/
439
440 void ExtractBlobsFromSegmentation(BLOCK_LIST *blocks, C_BLOB_LIST *output_blob_list) {
441 C_BLOB_IT return_list_it(output_blob_list);
442 BLOCK_IT block_it(blocks);
443 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
444 BLOCK *block = block_it.data();
445 ROW_IT row_it(block->row_list());
446 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
447 ROW *row = row_it.data();
448 // Iterate over all werds in the row.
449 WERD_IT werd_it(row->word_list());
450 for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) {
451 WERD *werd = werd_it.data();
452 return_list_it.move_to_last();
453 return_list_it.add_list_after(werd->cblob_list());
454 return_list_it.move_to_last();
455 return_list_it.add_list_after(werd->rej_cblob_list());
456 }
457 }
458 }
459 }
460
461 /**********************************************************************
462 * RefreshWordBlobsFromNewBlobs()
463 *
464 * Refreshes the words in the block_list by using blobs in the
465 * new_blobs list.
466 * Block list must have word segmentation in it.
467 * It consumes the blobs provided in the new_blobs list. The blobs leftover in
468 * the new_blobs list after the call weren't matched to any blobs of the words
469 * in block list.
470 * The output not_found_blobs is a list of blobs from the original segmentation
471 * in the block_list for which no corresponding new blobs were found.
472 **********************************************************************/
473
474 void RefreshWordBlobsFromNewBlobs(BLOCK_LIST *block_list, C_BLOB_LIST *new_blobs,
475 C_BLOB_LIST *not_found_blobs) {
476 // Now iterate over all the blobs in the segmentation_block_list_, and just
477 // replace the corresponding c-blobs inside the werds.
478 BLOCK_IT block_it(block_list);
479 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
480 BLOCK *block = block_it.data();
481 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
482 continue; // Don't touch non-text blocks.
483 }
484 // Iterate over all rows in the block.
485 ROW_IT row_it(block->row_list());
486 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
487 ROW *row = row_it.data();
488 // Iterate over all werds in the row.
489 WERD_IT werd_it(row->word_list());
490 WERD_LIST new_words;
491 WERD_IT new_words_it(&new_words);
492 for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) {
493 WERD *werd = werd_it.extract();
494 WERD *new_werd = werd->ConstructWerdWithNewBlobs(new_blobs, not_found_blobs);
495 if (new_werd) {
496 // Insert this new werd into the actual row's werd-list. Remove the
497 // existing one.
498 new_words_it.add_after_then_move(new_werd);
499 delete werd;
500 } else {
501 // Reinsert the older word back, for lack of better options.
502 // This is critical since dropping the words messes up segmentation:
503 // eg. 1st word in the row might otherwise have W_FUZZY_NON turned on.
504 new_words_it.add_after_then_move(werd);
505 }
506 }
507 // Get rid of the old word list & replace it with the new one.
508 row->word_list()->clear();
509 werd_it.move_to_first();
510 werd_it.add_list_after(&new_words);
511 }
512 }
513 }
514
515 } // namespace tesseract