comparison mupdf-source/thirdparty/tesseract/src/textord/textord.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: textord.h
3 // Description: The Textord class definition gathers text line and word
4 // finding functionality.
5 // Author: Ray Smith
6 // Created: Fri Mar 13 14:29:01 PDT 2009
7 //
8 // (C) Copyright 2009, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
19 ///////////////////////////////////////////////////////////////////////
20
21 #ifndef TESSERACT_TEXTORD_TEXTORD_H_
22 #define TESSERACT_TEXTORD_TEXTORD_H_
23
24 #include "bbgrid.h"
25 #include "blobbox.h"
26 #include "ccstruct.h"
27 #include "gap_map.h"
28
29 #include <tesseract/publictypes.h> // For PageSegMode.
30
31 namespace tesseract {
32
// Forward declarations of types that are named in the Textord member
// function signatures further down in this header.
class BLOCK_LIST;
class FCOORD;
class PAGE_RES;
class ScrollView;
class TO_BLOCK;
class TO_BLOCK_LIST;
39
40 // A simple class that can be used by BBGrid to hold a word and an expanded
41 // bounding box that makes it easy to find words to put diacritics.
42 class WordWithBox {
43 public:
44 WordWithBox() : word_(nullptr) {}
45 explicit WordWithBox(WERD *word) : word_(word), bounding_box_(word->bounding_box()) {
46 int height = bounding_box_.height();
47 bounding_box_.pad(height, height);
48 }
49
50 const TBOX &bounding_box() const {
51 return bounding_box_;
52 }
53 // Returns the bounding box of only the good blobs.
54 TBOX true_bounding_box() const {
55 return word_->true_bounding_box();
56 }
57 C_BLOB_LIST *RejBlobs() const {
58 return word_->rej_cblob_list();
59 }
60 const WERD *word() const {
61 return word_;
62 }
63
64 private:
65 // Borrowed pointer to a real word somewhere that must outlive this class.
66 WERD *word_;
67 // Cached expanded bounding box of the word, padded all round by its height.
68 TBOX bounding_box_;
69 };
70
71 // Make it usable by BBGrid.
72 CLISTIZEH(WordWithBox)
73 using WordGrid = BBGrid<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT>;
74 using WordSearch = GridSearch<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT>;
75
76 class Textord {
77 public:
78 explicit Textord(CCStruct *ccstruct);
79 ~Textord() = default;
80
81 // Make the textlines and words inside each block.
82 // binary_pix is mandatory and is the binarized input after line removal.
83 // grey_pix is optional, but if present must match the binary_pix in size,
84 // and must be a *real* grey image instead of binary_pix * 255.
85 // thresholds_pix is expected to be present iff grey_pix is present and
86 // can be an integer factor reduction of the grey_pix. It represents the
87 // thresholds that were used to create the binary_pix from the grey_pix.
88 // diacritic_blobs contain small confusing components that should be added
89 // to the appropriate word(s) in case they are really diacritics.
90 void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height,
91 Image binary_pix, Image thresholds_pix, Image grey_pix, bool use_box_bottoms,
92 BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks,
93 float *gradient);
94
95 // If we were supposed to return only a single textline, and there is more
96 // than one, clean up and leave only the best.
97 void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res);
98
99 bool use_cjk_fp_model() const {
100 return use_cjk_fp_model_;
101 }
102 void set_use_cjk_fp_model(bool flag) {
103 use_cjk_fp_model_ = flag;
104 }
105
106 // tospace.cpp ///////////////////////////////////////////
107 void to_spacing(ICOORD page_tr, // topright of page
108 TO_BLOCK_LIST *blocks // blocks on page
109 );
110 ROW *make_prop_words(TO_ROW *row, // row to make
111 FCOORD rotation // for drawing
112 );
113 ROW *make_blob_words(TO_ROW *row, // row to make
114 FCOORD rotation // for drawing
115 );
116 // tordmain.cpp ///////////////////////////////////////////
117 void find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);
118 void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, bool testing_on);
119
120 private:
121 // For underlying memory management and other utilities.
122 CCStruct *ccstruct_;
123
124 // The size of the input image.
125 ICOORD page_tr_;
126
127 bool use_cjk_fp_model_;
128
129 // makerow.cpp ///////////////////////////////////////////
130 // Make the textlines inside each block.
131 void MakeRows(PageSegMode pageseg_mode, const FCOORD &skew, int width, int height,
132 TO_BLOCK_LIST *to_blocks);
133 // Make the textlines inside a single block.
134 void MakeBlockRows(int min_spacing, int max_spacing, const FCOORD &skew, TO_BLOCK *block,
135 ScrollView *win);
136
137 public:
138 void compute_block_xheight(TO_BLOCK *block, float gradient);
139 void compute_row_xheight(TO_ROW *row, // row to do
140 const FCOORD &rotation,
141 float gradient, // global skew
142 int block_line_size);
143 void make_spline_rows(TO_BLOCK *block, // block to do
144 float gradient, // gradient to fit
145 bool testing_on);
146
147 private:
148 //// oldbasel.cpp ////////////////////////////////////////
149 void make_old_baselines(TO_BLOCK *block, // block to do
150 bool testing_on, // correct orientation
151 float gradient);
152 void correlate_lines(TO_BLOCK *block, float gradient);
153 void correlate_neighbours(TO_BLOCK *block, // block rows are in.
154 TO_ROW **rows, // rows of block.
155 int rowcount); // no of rows to do.
156 int correlate_with_stats(TO_ROW **rows, // rows of block.
157 int rowcount, // no of rows to do.
158 TO_BLOCK *block);
159 void find_textlines(TO_BLOCK *block, // block row is in
160 TO_ROW *row, // row to do
161 int degree, // required approximation
162 QSPLINE *spline); // starting spline
163 // tospace.cpp ///////////////////////////////////////////
164 // DEBUG USE ONLY
165 void block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional,
166 // resulting estimate
167 int16_t &block_space_gap_width,
168 // resulting estimate
169 int16_t &block_non_space_gap_width);
170 void row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx,
171 // estimate for block
172 int16_t block_space_gap_width,
173 // estimate for block
174 int16_t block_non_space_gap_width);
175 void old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,
176 STATS *small_gap_stats, int16_t block_space_gap_width,
177 // estimate for block
178 int16_t block_non_space_gap_width);
179 bool isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats, bool suspected_table,
180 int16_t block_idx, int16_t row_idx);
181 int16_t stats_count_under(STATS *stats, int16_t threshold);
182 void improve_row_threshold(TO_ROW *row, STATS *all_gap_stats);
183 bool make_a_word_break(TO_ROW *row, // row being made
184 TBOX blob_box, // for next_blob // how many blanks?
185 int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap,
186 int16_t within_xht_current_gap, TBOX next_blob_box, int16_t next_gap,
187 uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non,
188 bool &prev_gap_was_a_space, bool &break_at_next_gap);
189 bool narrow_blob(TO_ROW *row, TBOX blob_box);
190 bool wide_blob(TO_ROW *row, TBOX blob_box);
191 bool suspected_punct_blob(TO_ROW *row, TBOX box);
192 void peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box, int16_t &next_gap,
193 int16_t &next_within_xht_gap);
194 void mark_gap(TBOX blob, // blob following gap
195 int16_t rule, // heuristic id
196 int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,
197 int16_t next_blob_width, int16_t next_gap);
198 float find_mean_blob_spacing(WERD *word);
199 bool ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left, int16_t right);
200 // get bounding box
201 TBOX reduced_box_next(TO_ROW *row, // current row
202 BLOBNBOX_IT *it // iterator to blobds
203 );
204 TBOX reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht);
205 // tordmain.cpp ///////////////////////////////////////////
206 float filter_noise_blobs(BLOBNBOX_LIST *src_list, BLOBNBOX_LIST *noise_list,
207 BLOBNBOX_LIST *small_list, BLOBNBOX_LIST *large_list);
208 // Fixes the block so it obeys all the rules:
209 // Must have at least one ROW.
210 // Must have at least one WERD.
211 // WERDs contain a fake blob.
212 void cleanup_nontext_block(BLOCK *block);
213 void cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks);
214 bool clean_noise_from_row(ROW *row);
215 void clean_noise_from_words(ROW *row);
216 // Remove outlines that are a tiny fraction in either width or height
217 // of the word height.
218 void clean_small_noise_from_words(ROW *row);
219 // Groups blocks by rotation, then, for each group, makes a WordGrid and calls
220 // TransferDiacriticsToWords to copy the diacritic blobs to the most
221 // appropriate words in the group of blocks. Source blobs are not touched.
222 void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks);
223 // Places a copy of blobs that are near a word (after applying rotation to the
224 // blob) in the most appropriate word, unless there is doubt, in which case a
225 // blob can end up in two words. Source blobs are not touched.
226 void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, const FCOORD &rotation,
227 WordGrid *word_grid);
228
229 public:
230 // makerow.cpp ///////////////////////////////////////////
231 BOOL_VAR_H(textord_single_height_mode);
232 // tospace.cpp ///////////////////////////////////////////
233 BOOL_VAR_H(tosp_old_to_method);
234 BOOL_VAR_H(tosp_old_to_constrain_sp_kn);
235 BOOL_VAR_H(tosp_only_use_prop_rows);
236 BOOL_VAR_H(tosp_force_wordbreak_on_punct);
237 BOOL_VAR_H(tosp_use_pre_chopping);
238 BOOL_VAR_H(tosp_old_to_bug_fix);
239 BOOL_VAR_H(tosp_block_use_cert_spaces);
240 BOOL_VAR_H(tosp_row_use_cert_spaces);
241 BOOL_VAR_H(tosp_narrow_blobs_not_cert);
242 BOOL_VAR_H(tosp_row_use_cert_spaces1);
243 BOOL_VAR_H(tosp_recovery_isolated_row_stats);
244 BOOL_VAR_H(tosp_only_small_gaps_for_kern);
245 BOOL_VAR_H(tosp_all_flips_fuzzy);
246 BOOL_VAR_H(tosp_fuzzy_limit_all);
247 BOOL_VAR_H(tosp_stats_use_xht_gaps);
248 BOOL_VAR_H(tosp_use_xht_gaps);
249 BOOL_VAR_H(tosp_only_use_xht_gaps);
250 BOOL_VAR_H(tosp_rule_9_test_punct);
251 BOOL_VAR_H(tosp_flip_fuzz_kn_to_sp);
252 BOOL_VAR_H(tosp_flip_fuzz_sp_to_kn);
253 BOOL_VAR_H(tosp_improve_thresh);
254 INT_VAR_H(tosp_debug_level);
255 INT_VAR_H(tosp_enough_space_samples_for_median);
256 INT_VAR_H(tosp_redo_kern_limit);
257 INT_VAR_H(tosp_few_samples);
258 INT_VAR_H(tosp_short_row);
259 INT_VAR_H(tosp_sanity_method);
260 double_VAR_H(tosp_old_sp_kn_th_factor);
261 double_VAR_H(tosp_threshold_bias1);
262 double_VAR_H(tosp_threshold_bias2);
263 double_VAR_H(tosp_narrow_fraction);
264 double_VAR_H(tosp_narrow_aspect_ratio);
265 double_VAR_H(tosp_wide_fraction);
266 double_VAR_H(tosp_wide_aspect_ratio);
267 double_VAR_H(tosp_fuzzy_space_factor);
268 double_VAR_H(tosp_fuzzy_space_factor1);
269 double_VAR_H(tosp_fuzzy_space_factor2);
270 double_VAR_H(tosp_gap_factor);
271 double_VAR_H(tosp_kern_gap_factor1);
272 double_VAR_H(tosp_kern_gap_factor2);
273 double_VAR_H(tosp_kern_gap_factor3);
274 double_VAR_H(tosp_ignore_big_gaps);
275 double_VAR_H(tosp_ignore_very_big_gaps);
276 double_VAR_H(tosp_rep_space);
277 double_VAR_H(tosp_enough_small_gaps);
278 double_VAR_H(tosp_table_kn_sp_ratio);
279 double_VAR_H(tosp_table_xht_sp_ratio);
280 double_VAR_H(tosp_table_fuzzy_kn_sp_ratio);
281 double_VAR_H(tosp_fuzzy_kn_fraction);
282 double_VAR_H(tosp_fuzzy_sp_fraction);
283 double_VAR_H(tosp_min_sane_kn_sp);
284 double_VAR_H(tosp_init_guess_kn_mult);
285 double_VAR_H(tosp_init_guess_xht_mult);
286 double_VAR_H(tosp_max_sane_kn_thresh);
287 double_VAR_H(tosp_flip_caution);
288 double_VAR_H(tosp_large_kerning);
289 double_VAR_H(tosp_dont_fool_with_small_kerns);
290 double_VAR_H(tosp_near_lh_edge);
291 double_VAR_H(tosp_silly_kn_sp_gap);
292 double_VAR_H(tosp_pass_wide_fuzz_sp_to_context);
293 // tordmain.cpp ///////////////////////////////////////////
294 BOOL_VAR_H(textord_no_rejects);
295 BOOL_VAR_H(textord_show_blobs);
296 BOOL_VAR_H(textord_show_boxes);
297 INT_VAR_H(textord_max_noise_size);
298 INT_VAR_H(textord_baseline_debug);
299 double_VAR_H(textord_noise_area_ratio);
300 double_VAR_H(textord_initialx_ile);
301 double_VAR_H(textord_initialasc_ile);
302 INT_VAR_H(textord_noise_sizefraction);
303 double_VAR_H(textord_noise_sizelimit);
304 INT_VAR_H(textord_noise_translimit);
305 double_VAR_H(textord_noise_normratio);
306 BOOL_VAR_H(textord_noise_rejwords);
307 BOOL_VAR_H(textord_noise_rejrows);
308 double_VAR_H(textord_noise_syfract);
309 double_VAR_H(textord_noise_sxfract);
310 double_VAR_H(textord_noise_hfract);
311 INT_VAR_H(textord_noise_sncount);
312 double_VAR_H(textord_noise_rowratio);
313 BOOL_VAR_H(textord_noise_debug);
314 double_VAR_H(textord_blshift_maxshift);
315 double_VAR_H(textord_blshift_xfraction);
316 };
317
318 } // namespace tesseract
319
320 #endif // TESSERACT_TEXTORD_TEXTORD_H_