comparison mupdf-source/thirdparty/tesseract/src/ccstruct/split.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************************
2 *
3 * File: split.cpp (Formerly split.c)
4 * Author: Mark Seaman, OCR Technology
5 *
6 * (c) Copyright 1987, Hewlett-Packard Company.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 *************************************************************************/
18
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #include "split.h"
25
26 #include "coutln.h"
27 #include "tprintf.h"
28
29 #include <algorithm>
30
31 namespace tesseract {
32
33 /*----------------------------------------------------------------------
34 V a r i a b l e s
35 ----------------------------------------------------------------------*/
36 // Limit on the amount of penalty for the chop being off-center.
37 const int kCenterGradeCap = 25;
38 // Ridiculously large priority for splits that are no use.
39 const double kBadPriority = 999.0;
40
41 BOOL_VAR(wordrec_display_splits, 0, "Display splits");
42
43 // Hides the SPLIT so the outlines appear not to be cut by it.
44 void SPLIT::Hide() const {
45 EDGEPT *edgept = point1;
46 do {
47 edgept->Hide();
48 edgept = edgept->next;
49 } while (!edgept->EqualPos(*point2) && edgept != point1);
50 edgept = point2;
51 do {
52 edgept->Hide();
53 edgept = edgept->next;
54 } while (!edgept->EqualPos(*point1) && edgept != point2);
55 }
56
57 // Undoes hide, so the outlines are cut by the SPLIT.
58 void SPLIT::Reveal() const {
59 EDGEPT *edgept = point1;
60 do {
61 edgept->Reveal();
62 edgept = edgept->next;
63 } while (!edgept->EqualPos(*point2) && edgept != point1);
64 edgept = point2;
65 do {
66 edgept->Reveal();
67 edgept = edgept->next;
68 } while (!edgept->EqualPos(*point1) && edgept != point2);
69 }
70
71 // Compute a split priority based on the bounding boxes of the parts.
72 // The arguments here are config parameters defined in Wordrec. Add chop_
73 // to the beginning of the name.
74 float SPLIT::FullPriority(int xmin, int xmax, double overlap_knob, int centered_maxwidth,
75 double center_knob, double width_change_knob) const {
76 TBOX box1 = Box12();
77 TBOX box2 = Box21();
78 int min_left = std::min(box1.left(), box2.left());
79 int max_right = std::max(box1.right(), box2.right());
80 if (xmin < min_left && xmax > max_right) {
81 return kBadPriority;
82 }
83
84 float grade = 0.0f;
85 // grade_overlap.
86 int width1 = box1.width();
87 int width2 = box2.width();
88 int min_width = std::min(width1, width2);
89 int overlap = -box1.x_gap(box2);
90 if (overlap == min_width) {
91 grade += 100.0f; // Total overlap.
92 } else {
93 if (2 * overlap > min_width) {
94 overlap += 2 * overlap - min_width;
95 }
96 if (overlap > 0) {
97 grade += overlap_knob * overlap;
98 }
99 }
100 // grade_center_of_blob.
101 if (width1 <= centered_maxwidth || width2 <= centered_maxwidth) {
102 grade += std::min(static_cast<double>(kCenterGradeCap), center_knob * abs(width1 - width2));
103 }
104 // grade_width_change.
105 float width_change_grade = 20 - (max_right - min_left - std::max(width1, width2));
106 if (width_change_grade > 0.0f) {
107 grade += width_change_grade * width_change_knob;
108 }
109 return grade;
110 }
111
112 // Returns true if *this SPLIT appears OK in the sense that it does not cross
113 // any outlines and does not chop off any ridiculously small pieces.
114 bool SPLIT::IsHealthy(const TBLOB &blob, int min_points, int min_area) const {
115 return !IsLittleChunk(min_points, min_area) &&
116 !blob.SegmentCrossesOutline(point1->pos, point2->pos);
117 }
118
119 // Returns true if the split generates a small chunk in terms of either area
120 // or number of points.
121 bool SPLIT::IsLittleChunk(int min_points, int min_area) const {
122 if (point1->ShortNonCircularSegment(min_points, point2) &&
123 point1->SegmentArea(point2) < min_area) {
124 return true;
125 }
126 if (point2->ShortNonCircularSegment(min_points, point1) &&
127 point2->SegmentArea(point1) < min_area) {
128 return true;
129 }
130 return false;
131 }
132
133 /**********************************************************************
134 * make_edgept
135 *
136 * Create an EDGEPT and hook it into an existing list of edge points.
137 **********************************************************************/
138 EDGEPT *make_edgept(TDimension x, TDimension y, EDGEPT *next, EDGEPT *prev) {
139 EDGEPT *this_edgept;
140 /* Create point */
141 this_edgept = new EDGEPT;
142 this_edgept->pos.x = x;
143 this_edgept->pos.y = y;
144 // Now deal with the src_outline steps.
145 C_OUTLINE *prev_ol = prev->src_outline;
146 if (prev_ol != nullptr && prev->next == next) {
147 // Compute the fraction of the segment that is being cut.
148 FCOORD segment_vec(next->pos.x - prev->pos.x, next->pos.y - prev->pos.y);
149 FCOORD target_vec(x - prev->pos.x, y - prev->pos.y);
150 double cut_fraction = target_vec.length() / segment_vec.length();
151 // Get the start and end at the step level.
152 ICOORD step_start = prev_ol->position_at_index(prev->start_step);
153 int end_step = prev->start_step + prev->step_count;
154 int step_length = prev_ol->pathlength();
155 ICOORD step_end = prev_ol->position_at_index(end_step % step_length);
156 ICOORD step_vec = step_end - step_start;
157 double target_length = step_vec.length() * cut_fraction;
158 // Find the point on the segment that gives the length nearest to target.
159 int best_step = prev->start_step;
160 ICOORD total_step(0, 0);
161 double best_dist = target_length;
162 for (int s = prev->start_step; s < end_step; ++s) {
163 total_step += prev_ol->step(s % step_length);
164 double dist = fabs(target_length - total_step.length());
165 if (dist < best_dist) {
166 best_dist = dist;
167 best_step = s + 1;
168 }
169 }
170 // The new point is an intermediate point.
171 this_edgept->src_outline = prev_ol;
172 this_edgept->step_count = end_step - best_step;
173 this_edgept->start_step = best_step % step_length;
174 prev->step_count = best_step - prev->start_step;
175 } else {
176 // The new point is poly only.
177 this_edgept->src_outline = nullptr;
178 this_edgept->step_count = 0;
179 this_edgept->start_step = 0;
180 }
181 /* Hook it up */
182 this_edgept->next = next;
183 this_edgept->prev = prev;
184 prev->next = this_edgept;
185 next->prev = this_edgept;
186 /* Set up vec entries */
187 this_edgept->vec.x = this_edgept->next->pos.x - x;
188 this_edgept->vec.y = this_edgept->next->pos.y - y;
189 this_edgept->prev->vec.x = x - this_edgept->prev->pos.x;
190 this_edgept->prev->vec.y = y - this_edgept->prev->pos.y;
191 return this_edgept;
192 }
193
194 /**********************************************************************
195 * remove_edgept
196 *
197 * Remove a given EDGEPT from its list and delete it.
198 **********************************************************************/
199 void remove_edgept(EDGEPT *point) {
200 EDGEPT *prev = point->prev;
201 EDGEPT *next = point->next;
202 // Add point's steps onto prev's steps if they are from the same outline.
203 if (prev->src_outline == point->src_outline && prev->src_outline != nullptr) {
204 prev->step_count += point->step_count;
205 }
206 prev->next = next;
207 next->prev = prev;
208 prev->vec.x = next->pos.x - prev->pos.x;
209 prev->vec.y = next->pos.y - prev->pos.y;
210 delete point;
211 }
212
213 /**********************************************************************
214 * Print
215 *
216 * Shows the coordinates of both points in a split.
217 **********************************************************************/
218 void SPLIT::Print() const {
219 tprintf("(%d,%d)--(%d,%d)", point1->pos.x, point1->pos.y, point2->pos.x, point2->pos.y);
220 }
221
222 #ifndef GRAPHICS_DISABLED
223 // Draws the split in the given window.
224 void SPLIT::Mark(ScrollView *window) const {
225 window->Pen(ScrollView::GREEN);
226 window->Line(point1->pos.x, point1->pos.y, point2->pos.x, point2->pos.y);
227 window->UpdateWindow();
228 }
229 #endif
230
231 // Creates two outlines out of one by splitting the original one in half.
232 // Inserts the resulting outlines into the given list.
233 void SPLIT::SplitOutlineList(TESSLINE *outlines) const {
234 SplitOutline();
235 while (outlines->next != nullptr) {
236 outlines = outlines->next;
237 }
238
239 outlines->next = new TESSLINE;
240 outlines->next->loop = point1;
241 outlines->next->ComputeBoundingBox();
242
243 outlines = outlines->next;
244
245 outlines->next = new TESSLINE;
246 outlines->next->loop = point2;
247 outlines->next->ComputeBoundingBox();
248
249 outlines->next->next = nullptr;
250 }
251
252 // Makes a split between these two edge points, but does not affect the
253 // outlines to which they belong.
254 void SPLIT::SplitOutline() const {
255 EDGEPT *temp2 = point2->next;
256 EDGEPT *temp1 = point1->next;
257 /* Create two new points */
258 EDGEPT *new_point1 = make_edgept(point1->pos.x, point1->pos.y, temp1, point2);
259 EDGEPT *new_point2 = make_edgept(point2->pos.x, point2->pos.y, temp2, point1);
260 // point1 and 2 are now cross-over points, so they must have nullptr
261 // src_outlines and give their src_outline information their new
262 // replacements.
263 new_point1->src_outline = point1->src_outline;
264 new_point1->start_step = point1->start_step;
265 new_point1->step_count = point1->step_count;
266 new_point2->src_outline = point2->src_outline;
267 new_point2->start_step = point2->start_step;
268 new_point2->step_count = point2->step_count;
269 point1->src_outline = nullptr;
270 point1->start_step = 0;
271 point1->step_count = 0;
272 point2->src_outline = nullptr;
273 point2->start_step = 0;
274 point2->step_count = 0;
275 }
276
277 // Undoes the effect of SplitOutlineList, correcting the outlines for undoing
278 // the split, but possibly leaving some duplicate outlines.
279 void SPLIT::UnsplitOutlineList(TBLOB *blob) const {
280 /* Modify edge points */
281 UnsplitOutlines();
282
283 auto *outline1 = new TESSLINE;
284 outline1->next = blob->outlines;
285 blob->outlines = outline1;
286 outline1->loop = point1;
287
288 auto *outline2 = new TESSLINE;
289 outline2->next = blob->outlines;
290 blob->outlines = outline2;
291 outline2->loop = point2;
292 }
293
294 // Removes the split that was put between these two points.
295 void SPLIT::UnsplitOutlines() const {
296 EDGEPT *tmp1 = point1->next;
297 EDGEPT *tmp2 = point2->next;
298
299 tmp1->next->prev = point2;
300 tmp2->next->prev = point1;
301
302 // tmp2 is coincident with point1. point1 takes tmp2's place as tmp2 is
303 // deleted.
304 point1->next = tmp2->next;
305 point1->src_outline = tmp2->src_outline;
306 point1->start_step = tmp2->start_step;
307 point1->step_count = tmp2->step_count;
308 // Likewise point2 takes tmp1's place.
309 point2->next = tmp1->next;
310 point2->src_outline = tmp1->src_outline;
311 point2->start_step = tmp1->start_step;
312 point2->step_count = tmp1->step_count;
313
314 delete tmp1;
315 delete tmp2;
316
317 point1->vec.x = point1->next->pos.x - point1->pos.x;
318 point1->vec.y = point1->next->pos.y - point1->pos.y;
319
320 point2->vec.x = point2->next->pos.x - point2->pos.x;
321 point2->vec.y = point2->next->pos.y - point2->pos.y;
322 }
323
324 } // namespace tesseract