comparison mupdf-source/thirdparty/tesseract/src/textord/underlin.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: underlin.cpp (Formerly undrline.c)
3 * Description: Code to chop blobs apart from underlines.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1994, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #include "underlin.h"
20
21 namespace tesseract {
22
// Multiplied by the row x-height in restore_underlined_blobs() and passed to
// find_underlined_blobs() as its baseline_offset argument.
double_VAR(textord_underline_offset, 0.1, "Fraction of x to ignore");
// Not read anywhere in this file; presumably checked by the caller that
// decides whether to run restore_underlined_blobs() -- TODO confirm.
BOOL_VAR(textord_restore_underlines, true, "Chop underlines & put back");
25
26 /**********************************************************************
27 * restore_underlined_blobs
28 *
29 * Find underlined blobs and put them back in the row.
30 **********************************************************************/
31
32 void restore_underlined_blobs( // get chop points
33 TO_BLOCK *block // block to do
34 ) {
35 int16_t chop_coord; // chop boundary
36 TBOX blob_box; // of underline
37 BLOBNBOX *u_line; // underline bit
38 TO_ROW *row; // best row for blob
39 ICOORDELT_LIST chop_cells; // blobs to cut out
40 // real underlines
41 BLOBNBOX_LIST residual_underlines;
42 C_OUTLINE_LIST left_coutlines;
43 C_OUTLINE_LIST right_coutlines;
44 ICOORDELT_IT cell_it = &chop_cells;
45 // under lines
46 BLOBNBOX_IT under_it = &block->underlines;
47 BLOBNBOX_IT ru_it = &residual_underlines;
48
49 if (block->get_rows()->empty()) {
50 return; // Don't crash if there are no rows.
51 }
52 for (under_it.mark_cycle_pt(); !under_it.cycled_list(); under_it.forward()) {
53 u_line = under_it.extract();
54 blob_box = u_line->bounding_box();
55 row = most_overlapping_row(block->get_rows(), u_line);
56 if (row == nullptr) {
57 return; // Don't crash if there is no row.
58 }
59 find_underlined_blobs(u_line, &row->baseline, row->xheight,
60 row->xheight * textord_underline_offset, &chop_cells);
61 cell_it.set_to_list(&chop_cells);
62 for (cell_it.mark_cycle_pt(); !cell_it.cycled_list(); cell_it.forward()) {
63 chop_coord = cell_it.data()->x();
64 if (cell_it.data()->y() - chop_coord > textord_fp_chop_error + 1) {
65 split_to_blob(u_line, chop_coord, textord_fp_chop_error + 0.5, &left_coutlines,
66 &right_coutlines);
67 if (!left_coutlines.empty()) {
68 ru_it.add_after_then_move(new BLOBNBOX(new C_BLOB(&left_coutlines)));
69 }
70 chop_coord = cell_it.data()->y();
71 split_to_blob(nullptr, chop_coord, textord_fp_chop_error + 0.5, &left_coutlines,
72 &right_coutlines);
73 if (!left_coutlines.empty()) {
74 row->insert_blob(new BLOBNBOX(new C_BLOB(&left_coutlines)));
75 }
76 u_line = nullptr; // no more blobs to add
77 }
78 delete cell_it.extract();
79 }
80 if (!right_coutlines.empty()) {
81 split_to_blob(nullptr, blob_box.right(), textord_fp_chop_error + 0.5, &left_coutlines,
82 &right_coutlines);
83 if (!left_coutlines.empty()) {
84 ru_it.add_after_then_move(new BLOBNBOX(new C_BLOB(&left_coutlines)));
85 }
86 }
87 delete u_line;
88 }
89 if (!ru_it.empty()) {
90 ru_it.move_to_first();
91 for (ru_it.mark_cycle_pt(); !ru_it.cycled_list(); ru_it.forward()) {
92 under_it.add_after_then_move(ru_it.extract());
93 }
94 }
95 }
96
97 /**********************************************************************
98 * most_overlapping_row
99 *
100 * Return the row which most overlaps the blob.
101 **********************************************************************/
102
103 TO_ROW *most_overlapping_row( // find best row
104 TO_ROW_LIST *rows, // list of rows
105 BLOBNBOX *blob // blob to place
106 ) {
107 int16_t x = (blob->bounding_box().left() + blob->bounding_box().right()) / 2;
108 TO_ROW_IT row_it = rows; // row iterator
109 TO_ROW *row; // current row
110 TO_ROW *best_row; // output row
111 float overlap; // of blob & row
112 float bestover; // best overlap
113
114 best_row = nullptr;
115 bestover = static_cast<float>(-INT32_MAX);
116 if (row_it.empty()) {
117 return nullptr;
118 }
119 row = row_it.data();
120 row_it.mark_cycle_pt();
121 while (row->baseline.y(x) + row->descdrop > blob->bounding_box().top() && !row_it.cycled_list()) {
122 best_row = row;
123 bestover = blob->bounding_box().top() - row->baseline.y(x) + row->descdrop;
124 row_it.forward();
125 row = row_it.data();
126 }
127 while (row->baseline.y(x) + row->xheight + row->ascrise >= blob->bounding_box().bottom() &&
128 !row_it.cycled_list()) {
129 overlap = row->baseline.y(x) + row->xheight + row->ascrise;
130 if (blob->bounding_box().top() < overlap) {
131 overlap = blob->bounding_box().top();
132 }
133 if (blob->bounding_box().bottom() > row->baseline.y(x) + row->descdrop) {
134 overlap -= blob->bounding_box().bottom();
135 } else {
136 overlap -= row->baseline.y(x) + row->descdrop;
137 }
138 if (overlap > bestover) {
139 bestover = overlap;
140 best_row = row;
141 }
142 row_it.forward();
143 row = row_it.data();
144 }
145 if (bestover < 0 &&
146 row->baseline.y(x) + row->xheight + row->ascrise - blob->bounding_box().bottom() > bestover) {
147 best_row = row;
148 }
149 return best_row;
150 }
151
152 /**********************************************************************
153 * find_underlined_blobs
154 *
155 * Find the start and end coords of blobs in the underline.
156 **********************************************************************/
157
158 void find_underlined_blobs( // get chop points
159 BLOBNBOX *u_line, // underlined unit
160 QSPLINE *baseline, // actual baseline
161 float xheight, // height of line
162 float baseline_offset, // amount to shrinke it
163 ICOORDELT_LIST *chop_cells // places to chop
164 ) {
165 ICOORD blob_chop; // sides of blob
166 TBOX blob_box = u_line->bounding_box();
167 // cell iterator
168 ICOORDELT_IT cell_it = chop_cells;
169 STATS upper_proj(blob_box.left(), blob_box.right());
170 STATS middle_proj(blob_box.left(), blob_box.right());
171 STATS lower_proj(blob_box.left(), blob_box.right());
172 C_OUTLINE_IT out_it; // outlines of blob
173
174 ASSERT_HOST(u_line->cblob() != nullptr);
175
176 out_it.set_to_list(u_line->cblob()->out_list());
177 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
178 vertical_cunderline_projection(out_it.data(), baseline, xheight, baseline_offset, &lower_proj,
179 &middle_proj, &upper_proj);
180 }
181
182 for (auto x = blob_box.left(); x < blob_box.right(); x++) {
183 if (middle_proj.pile_count(x) > 0) {
184 auto y = x + 1;
185 for (; y < blob_box.right() && middle_proj.pile_count(y) > 0; y++) {
186 ;
187 }
188 blob_chop = ICOORD(x, y);
189 cell_it.add_after_then_move(new ICOORDELT(blob_chop));
190 x = y;
191 }
192 }
193 }
194
195 /**********************************************************************
196 * vertical_cunderline_projection
197 *
198 * Compute the vertical projection of an outline from its outlines
199 * and add to the given STATS.
200 **********************************************************************/
201
202 void vertical_cunderline_projection( // project outlines
203 C_OUTLINE *outline, // outline to project
204 QSPLINE *baseline, // actual baseline
205 float xheight, // height of line
206 float baseline_offset, // amount to shrinke it
207 STATS *lower_proj, // below baseline
208 STATS *middle_proj, // centre region
209 STATS *upper_proj // top region
210 ) {
211 ICOORD pos; // current point
212 ICOORD step; // edge step
213 int16_t lower_y, upper_y; // region limits
214 C_OUTLINE_IT out_it = outline->child();
215
216 pos = outline->start_pos();
217 int16_t length = outline->pathlength();
218 for (int16_t stepindex = 0; stepindex < length; stepindex++) {
219 step = outline->step(stepindex);
220 if (step.x() > 0) {
221 lower_y = static_cast<int16_t>(floor(baseline->y(pos.x()) + baseline_offset + 0.5));
222 upper_y = static_cast<int16_t>(floor(baseline->y(pos.x()) + baseline_offset + xheight + 0.5));
223 if (pos.y() >= lower_y) {
224 lower_proj->add(pos.x(), -lower_y);
225 if (pos.y() >= upper_y) {
226 middle_proj->add(pos.x(), lower_y - upper_y);
227 upper_proj->add(pos.x(), upper_y - pos.y());
228 } else {
229 middle_proj->add(pos.x(), lower_y - pos.y());
230 }
231 } else {
232 lower_proj->add(pos.x(), -pos.y());
233 }
234 } else if (step.x() < 0) {
235 lower_y = static_cast<int16_t>(floor(baseline->y(pos.x() - 1) + baseline_offset + 0.5));
236 upper_y =
237 static_cast<int16_t>(floor(baseline->y(pos.x() - 1) + baseline_offset + xheight + 0.5));
238 if (pos.y() >= lower_y) {
239 lower_proj->add(pos.x() - 1, lower_y);
240 if (pos.y() >= upper_y) {
241 middle_proj->add(pos.x() - 1, upper_y - lower_y);
242 upper_proj->add(pos.x() - 1, pos.y() - upper_y);
243 } else {
244 middle_proj->add(pos.x() - 1, pos.y() - lower_y);
245 }
246 } else {
247 lower_proj->add(pos.x() - 1, pos.y());
248 }
249 }
250 pos += step;
251 }
252
253 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
254 vertical_cunderline_projection(out_it.data(), baseline, xheight, baseline_offset, lower_proj,
255 middle_proj, upper_proj);
256 }
257 }
258
259 } // namespace tesseract