comparison mupdf-source/thirdparty/tesseract/src/textord/makerow.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: makerow.h (Formerly makerows.h)
3 * Description: Code to arrange blobs into rows of text.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #ifndef MAKEROW_H
20 #define MAKEROW_H
21
22 #include "blobbox.h"
23 #include "blobs.h"
24 #include "ocrblock.h"
25 #include "params.h"
26 #include "statistc.h"
27
28 namespace tesseract {
29
30 enum OVERLAP_STATE { // return type of most_overlapping_row()
31 ASSIGN, // assign it to row
32 REJECT, // reject it - dual overlap
33 NEW_ROW // presumably: the blob should start a new row - confirm against most_overlapping_row()
34 };
35
36 enum ROW_CATEGORY { // classification returned by get_row_category()
37 ROW_ASCENDERS_FOUND, // row passed the xheight check and row->ascrise > 0
38 ROW_DESCENDERS_FOUND, // ascrise not > 0, but row->descdrop != 0
39 ROW_UNKNOWN, // passed the xheight check; ascrise not > 0 and descdrop == 0
40 ROW_INVALID, // row->xheight <= 0
41 };
42
43 extern BOOL_VAR_H(textord_heavy_nr); // declarations only (extern); the *_VAR_H macros presumably come from params.h - confirm
44 extern BOOL_VAR_H(textord_show_initial_rows);
45 extern BOOL_VAR_H(textord_show_parallel_rows);
46 extern BOOL_VAR_H(textord_show_expanded_rows);
47 extern BOOL_VAR_H(textord_show_final_rows);
48 extern BOOL_VAR_H(textord_show_final_blobs);
49 extern BOOL_VAR_H(textord_test_landscape);
50 extern BOOL_VAR_H(textord_parallel_baselines);
51 extern BOOL_VAR_H(textord_straight_baselines);
52 extern BOOL_VAR_H(textord_old_baselines);
53 extern BOOL_VAR_H(textord_old_xheight);
54 extern BOOL_VAR_H(textord_fix_xheight_bug);
55 extern BOOL_VAR_H(textord_fix_makerow_bug);
56 extern BOOL_VAR_H(textord_debug_xheights);
57 extern INT_VAR_H(textord_test_x);
58 extern INT_VAR_H(textord_test_y);
59 extern INT_VAR_H(textord_min_blobs_in_row);
60 extern INT_VAR_H(textord_spline_minblobs);
61 extern INT_VAR_H(textord_spline_medianwin);
62 extern INT_VAR_H(textord_min_xheight); // lower clamp applied to *min_height in get_min_max_xheight()
63 extern double_VAR_H(textord_spline_shift_fraction);
64 extern double_VAR_H(textord_skew_ile);
65 extern double_VAR_H(textord_skew_lag);
66 extern double_VAR_H(textord_linespace_iqrlimit);
67 extern double_VAR_H(textord_width_limit);
68 extern double_VAR_H(textord_chop_width);
69 extern double_VAR_H(textord_minxh); // multiplied by block_linesize in get_min_max_xheight()
70 extern double_VAR_H(textord_min_linesize);
71 extern double_VAR_H(textord_excess_blobsize);
72 extern double_VAR_H(textord_occupancy_threshold);
73 extern double_VAR_H(textord_underline_width);
74 extern double_VAR_H(textord_min_blob_height_fraction);
75 extern double_VAR_H(textord_xheight_mode_fraction);
76 extern double_VAR_H(textord_ascheight_mode_fraction);
77 extern double_VAR_H(textord_ascx_ratio_min);
78 extern double_VAR_H(textord_ascx_ratio_max);
79 extern double_VAR_H(textord_descx_ratio_min);
80 extern double_VAR_H(textord_descx_ratio_max);
81 extern double_VAR_H(textord_xheight_error_margin);
82 extern INT_VAR_H(textord_lms_line_trials);
83 extern BOOL_VAR_H(textord_new_initial_xheight);
84 extern BOOL_VAR_H(textord_debug_blob);
85
86 inline void get_min_max_xheight(int block_linesize, int *min_height, int *max_height) {
87 // Lower bound: block_linesize scaled by textord_minxh and rounded down ...
88 int lowest = static_cast<int32_t>(floor(block_linesize * textord_minxh));
89 if (lowest < textord_min_xheight) { lowest = textord_min_xheight; } // ... but never below textord_min_xheight.
90 *min_height = lowest;
91 *max_height = static_cast<int32_t>(ceil(block_linesize * 3.0)); // Upper bound: three line sizes, rounded up.
92 }
93
94 inline ROW_CATEGORY get_row_category(const TO_ROW *row) {
95 // A row whose xheight is <= 0 is reported as invalid before anything else is examined.
96 if (row->xheight <= 0) { return ROW_INVALID; }
97 // A positive ascrise takes priority over any descdrop value.
98 if (row->ascrise > 0) { return ROW_ASCENDERS_FOUND; }
99 return (row->descdrop != 0) ? ROW_DESCENDERS_FOUND : ROW_UNKNOWN;
100 }
101
102 inline bool within_error_margin(float test, float num, float margin) { // is test inside [num*(1-margin), num*(1+margin)]?
103 return (num * (1 - margin) <= test) && (test <= num * (1 + margin));
104 }
105
106 void fill_heights(TO_ROW *row, float gradient, int min_height, int max_height, STATS *heights,
107 STATS *floating_heights); // NOTE(review): parameters undocumented; min/max presumably come from get_min_max_xheight() - confirm
108
109 float make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK *block, TO_BLOCK_LIST *blocks);
110 float make_rows(ICOORD page_tr, // top right
111 TO_BLOCK_LIST *port_blocks);
112 void make_initial_textrows(ICOORD page_tr, // top right
113 TO_BLOCK *block, // block to do
114 FCOORD rotation, // for drawing
115 bool testing_on); // correct orientation (NOTE(review): wording does not obviously describe a bool flag - confirm)
116 void fit_lms_line(TO_ROW *row);
117 void compute_page_skew(TO_BLOCK_LIST *blocks, // list of blocks
118 float &page_m, // average gradient (non-const reference: presumably an output - confirm)
119 float &page_err); // average error (non-const reference: presumably an output - confirm)
120 void vigorous_noise_removal(TO_BLOCK *block);
121 void cleanup_rows_making(ICOORD page_tr, // top right
122 TO_BLOCK *block, // block to do
123 float gradient, // gradient to fit
124 FCOORD rotation, // for drawing
125 int32_t block_edge, // edge of block
126 bool testing_on); // correct orientation
127 void delete_non_dropout_rows( // find lines (NOTE(review): generic comment repeated on several prototypes - see definition)
128 TO_BLOCK *block, // block to do
129 float gradient, // global skew
130 FCOORD rotation, // deskew vector
131 int32_t block_edge, // left edge
132 bool testing_on // correct orientation
133 );
134 bool find_best_dropout_row( // find neighbours
135 TO_ROW *row, // row to test
136 int32_t distance, // dropout dist
137 float dist_limit, // threshold distance
138 int32_t line_index, // index of row
139 TO_ROW_IT *row_it, // current position
140 bool testing_on // correct orientation
141 );
142 TBOX deskew_block_coords( // block box (returned as a TBOX)
143 TO_BLOCK *block, // block to do
144 float gradient // global skew
145 );
146 void compute_line_occupation( // project blobs
147 TO_BLOCK *block, // block to do
148 float gradient, // global skew
149 int32_t min_y, // min coord in block
150 int32_t max_y, // max coord in block
151 int32_t *occupation, // output projection
152 int32_t *deltas // derivative
153 );
154 void compute_occupation_threshold( // thresholds from projection
155 int32_t low_window, // below result point
156 int32_t high_window, // above result point
157 int32_t line_count, // array sizes
158 int32_t *occupation, // input projection
159 int32_t *thresholds // output thresholds
160 );
161 void compute_dropout_distances( // NOTE(review): was "project blobs", apparently copied from compute_line_occupation - confirm purpose
162 int32_t *occupation, // input projection
163 int32_t *thresholds, // output thresholds (NOTE(review): possibly an input here - confirm)
164 int32_t line_count // array sizes
165 );
166 void expand_rows( // find lines (NOTE(review): same generic comment as compute_row_stats/median_block_xheight - see definition)
167 ICOORD page_tr, // top right
168 TO_BLOCK *block, // block to do
169 float gradient, // gradient to fit
170 FCOORD rotation, // for drawing
171 int32_t block_edge, // edge of block
172 bool testing_on // correct orientation
173 );
174 void adjust_row_limits( // tidy limits
175 TO_BLOCK *block // block to do
176 );
177 void compute_row_stats( // find lines (NOTE(review): generic leftover comment - see definition)
178 TO_BLOCK *block, // block to do
179 bool testing_on // correct orientation
180 );
181 float median_block_xheight( // find lines (NOTE(review): generic leftover comment; returns a float - see definition)
182 TO_BLOCK *block, // block to do
183 float gradient // global skew
184 );
185
186 int compute_xheight_from_modes(STATS *heights, STATS *floating_heights, bool cap_only,
187 int min_height, int max_height, float *xheight, float *ascrise); // xheight/ascrise are pointers: presumably outputs - confirm
188
189 int32_t compute_row_descdrop(TO_ROW *row, // row to do
190 float gradient, // global skew
191 int xheight_blob_count, STATS *heights);
192 int32_t compute_height_modes(STATS *heights, // stats to search
193 int32_t min_height, // bottom of range
194 int32_t max_height, // top of range
195 int32_t *modes, // output array
196 int32_t maxmodes); // size of modes
197 void correct_row_xheight(TO_ROW *row, // row to fix
198 float xheight, // average values
199 float ascrise, float descdrop); // presumably also average values - confirm
200 void separate_underlines(TO_BLOCK *block, // block to do
201 float gradient, // skew angle (NOTE(review): other prototypes describe gradient as "global skew" - confirm units)
202 FCOORD rotation, // inverse landscape
203 bool testing_on); // correct orientation
204 void pre_associate_blobs(ICOORD page_tr, // top right
205 TO_BLOCK *block, // block to do
206 FCOORD rotation, // inverse landscape
207 bool testing_on); // correct orientation
208 void fit_parallel_rows(TO_BLOCK *block, // block to do
209 float gradient, // gradient to fit
210 FCOORD rotation, // for drawing
211 int32_t block_edge, // edge of block
212 bool testing_on); // correct orientation
213 void fit_parallel_lms(float gradient, // forced gradient
214 TO_ROW *row); // row to fit
215 void make_baseline_spline(TO_ROW *row, // row to fit
216 TO_BLOCK *block); // block it came from
217 bool segment_baseline( // split baseline
218 TO_ROW *row, // row to fit
219 TO_BLOCK *block, // block it came from
220 int32_t &segments, // no. of segments
221 int32_t *xstarts // coords of segments
222 );
223 double *linear_spline_baseline( // split baseline (NOTE(review): returns a raw double*; ownership not visible here - see definition)
224 TO_ROW *row, // row to fit
225 TO_BLOCK *block, // block it came from
226 int32_t &segments, // no. of segments
227 int32_t xstarts[] // coords of segments
228 );
229 void assign_blobs_to_rows( // find lines
230 TO_BLOCK *block, // block to do
231 float *gradient, // block skew
232 int pass, // identification
233 bool reject_misses, // chuck big ones out
234 bool make_new_rows, // add rows for unmatched
235 bool drawing_skew // draw smoothed skew
236 );
237 // find best row; the result is an OVERLAP_STATE (ASSIGN, REJECT or NEW_ROW)
238 OVERLAP_STATE most_overlapping_row(TO_ROW_IT *row_it, // iterator
239 TO_ROW *&best_row, // output row
240 float top, // top of blob
241 float bottom, // bottom of blob
242 float rowsize, // max row size
243 bool testing_blob // test stuff
244 );
245 int blob_x_order( // sort function (qsort-style signature: two const void* items, int result)
246 const void *item1, // items to compare
247 const void *item2);
248
249 void mark_repeated_chars(TO_ROW *row);
250
251 } // namespace tesseract
252
253 #endif