Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/textord.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer includes a version number.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: textord.cpp | |
| 3 // Description: The top-level text line and word finding functionality. | |
| 4 // Author: Ray Smith | |
| 5 // Created: Fri Mar 13 14:43:01 PDT 2009 | |
| 6 // | |
| 7 // (C) Copyright 2009, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 // | |
| 18 /////////////////////////////////////////////////////////////////////// | |
| 19 | |
| 20 // Include automatically generated configuration file if running autoconf. | |
| 21 #ifdef HAVE_CONFIG_H | |
| 22 # include "config_auto.h" | |
| 23 #endif | |
| 24 | |
| 25 #include "baselinedetect.h" | |
| 26 #include "drawtord.h" | |
| 27 #include "makerow.h" | |
| 28 #include "pageres.h" | |
| 29 #include "textord.h" | |
| 30 #include "tordmain.h" | |
| 31 #include "wordseg.h" | |
| 32 | |
| 33 namespace tesseract { | |
| 34 | |
| 35 Textord::Textord(CCStruct *ccstruct) | |
| 36 : ccstruct_(ccstruct) | |
| 37 , use_cjk_fp_model_(false) | |
| 38 , | |
| 39 // makerow.cpp parameters. Each *_MEMBER entry in this initializer list is given a name, a value, a description string and ccstruct_->params(). | |
| 40 BOOL_MEMBER(textord_single_height_mode, false, "Script has no xheight, so use a single mode", | |
| 41 ccstruct_->params()) | |
| 42 , | |
| 43 // tospace.cpp parameters (the tosp_* members that follow). | |
| 44 BOOL_MEMBER(tosp_old_to_method, false, "Space stats use prechopping?", ccstruct_->params()) | |
| 45 , BOOL_MEMBER(tosp_old_to_constrain_sp_kn, false, | |
| 46 "Constrain relative values of inter and intra-word gaps for " | |
| 47 "old_to_method.", | |
| 48 ccstruct_->params()) | |
| 49 , BOOL_MEMBER(tosp_only_use_prop_rows, true, "Block stats to use fixed pitch rows?", | |
| 50 ccstruct_->params()) | |
| 51 , BOOL_MEMBER(tosp_force_wordbreak_on_punct, false, | |
| 52 "Force word breaks on punct to break long lines in non-space " | |
| 53 "delimited langs", | |
| 54 ccstruct_->params()) | |
| 55 , BOOL_MEMBER(tosp_use_pre_chopping, false, "Space stats use prechopping?", ccstruct_->params()) | |
| 56 , BOOL_MEMBER(tosp_old_to_bug_fix, false, "Fix suspected bug in old code", ccstruct_->params()) | |
| 57 , BOOL_MEMBER(tosp_block_use_cert_spaces, true, "Only stat OBVIOUS spaces", ccstruct_->params()) | |
| 58 , BOOL_MEMBER(tosp_row_use_cert_spaces, true, "Only stat OBVIOUS spaces", ccstruct_->params()) | |
| 59 , BOOL_MEMBER(tosp_narrow_blobs_not_cert, true, "Only stat OBVIOUS spaces", ccstruct_->params()) | |
| 60 , BOOL_MEMBER(tosp_row_use_cert_spaces1, true, "Only stat OBVIOUS spaces", ccstruct_->params()) | |
| 61 , BOOL_MEMBER(tosp_recovery_isolated_row_stats, true, | |
| 62 "Use row alone when inadequate cert spaces", ccstruct_->params()) | |
| 63 , BOOL_MEMBER(tosp_only_small_gaps_for_kern, false, "Better guess", ccstruct_->params()) | |
| 64 , BOOL_MEMBER(tosp_all_flips_fuzzy, false, "Pass ANY flip to context?", ccstruct_->params()) | |
| 65 , BOOL_MEMBER(tosp_fuzzy_limit_all, true, "Don't restrict kn->sp fuzzy limit to tables", | |
| 66 ccstruct_->params()) | |
| 67 , BOOL_MEMBER(tosp_stats_use_xht_gaps, true, "Use within xht gap for wd breaks", | |
| 68 ccstruct_->params()) | |
| 69 , BOOL_MEMBER(tosp_use_xht_gaps, true, "Use within xht gap for wd breaks", ccstruct_->params()) | |
| 70 , BOOL_MEMBER(tosp_only_use_xht_gaps, false, "Only use within xht gap for wd breaks", | |
| 71 ccstruct_->params()) | |
| 72 , BOOL_MEMBER(tosp_rule_9_test_punct, false, "Don't chng kn to space next to punct", | |
| 73 ccstruct_->params()) | |
| 74 , BOOL_MEMBER(tosp_flip_fuzz_kn_to_sp, true, "Default flip", ccstruct_->params()) | |
| 75 , BOOL_MEMBER(tosp_flip_fuzz_sp_to_kn, true, "Default flip", ccstruct_->params()) | |
| 76 , BOOL_MEMBER(tosp_improve_thresh, false, "Enable improvement heuristic", ccstruct_->params()) | |
| 77 , INT_MEMBER(tosp_debug_level, 0, "Debug data", ccstruct_->params()) | |
| 78 , INT_MEMBER(tosp_enough_space_samples_for_median, 3, "or should we use mean", | |
| 79 ccstruct_->params()) | |
| 80 , INT_MEMBER(tosp_redo_kern_limit, 10, "No.samples reqd to reestimate for row", | |
| 81 ccstruct_->params()) | |
| 82 , INT_MEMBER(tosp_few_samples, 40, "No.gaps reqd with 1 large gap to treat as a table", | |
| 83 ccstruct_->params()) | |
| 84 , INT_MEMBER(tosp_short_row, 20, "No.gaps reqd with few cert spaces to use certs", | |
| 85 ccstruct_->params()) | |
| 86 , INT_MEMBER(tosp_sanity_method, 1, "How to avoid being silly", ccstruct_->params()) | |
| 87 , double_MEMBER(tosp_old_sp_kn_th_factor, 2.0, | |
| 88 "Factor for defining space threshold in terms of space and " | |
| 89 "kern sizes", | |
| 90 ccstruct_->params()) | |
| 91 , double_MEMBER(tosp_threshold_bias1, 0, "how far between kern and space?", ccstruct_->params()) | |
| 92 , double_MEMBER(tosp_threshold_bias2, 0, "how far between kern and space?", ccstruct_->params()) | |
| 93 , double_MEMBER(tosp_narrow_fraction, 0.3, "Fract of xheight for narrow", ccstruct_->params()) | |
| 94 , double_MEMBER(tosp_narrow_aspect_ratio, 0.48, "narrow if w/h less than this", | |
| 95 ccstruct_->params()) | |
| 96 , double_MEMBER(tosp_wide_fraction, 0.52, "Fract of xheight for wide", ccstruct_->params()) | |
| 97 , double_MEMBER(tosp_wide_aspect_ratio, 0.0, "wide if w/h less than this", ccstruct_->params()) | |
| 98 , double_MEMBER(tosp_fuzzy_space_factor, 0.6, "Fract of xheight for fuzz sp", | |
| 99 ccstruct_->params()) | |
| 100 , double_MEMBER(tosp_fuzzy_space_factor1, 0.5, "Fract of xheight for fuzz sp", | |
| 101 ccstruct_->params()) | |
| 102 , double_MEMBER(tosp_fuzzy_space_factor2, 0.72, "Fract of xheight for fuzz sp", | |
| 103 ccstruct_->params()) | |
| 104 , double_MEMBER(tosp_gap_factor, 0.83, "gap ratio to flip sp->kern", ccstruct_->params()) | |
| 105 , double_MEMBER(tosp_kern_gap_factor1, 2.0, "gap ratio to flip kern->sp", ccstruct_->params()) | |
| 106 , double_MEMBER(tosp_kern_gap_factor2, 1.3, "gap ratio to flip kern->sp", ccstruct_->params()) | |
| 107 , double_MEMBER(tosp_kern_gap_factor3, 2.5, "gap ratio to flip kern->sp", ccstruct_->params()) | |
| 108 , double_MEMBER(tosp_ignore_big_gaps, -1, "xht multiplier", ccstruct_->params()) | |
| 109 , double_MEMBER(tosp_ignore_very_big_gaps, 3.5, "xht multiplier", ccstruct_->params()) | |
| 110 , double_MEMBER(tosp_rep_space, 1.6, "rep gap multiplier for space", ccstruct_->params()) | |
| 111 , double_MEMBER(tosp_enough_small_gaps, 0.65, "Fract of kerns reqd for isolated row stats", | |
| 112 ccstruct_->params()) | |
| 113 , double_MEMBER(tosp_table_kn_sp_ratio, 2.25, "Min difference of kn & sp in table", | |
| 114 ccstruct_->params()) | |
| 115 , double_MEMBER(tosp_table_xht_sp_ratio, 0.33, "Expect spaces bigger than this", | |
| 116 ccstruct_->params()) | |
| 117 , double_MEMBER(tosp_table_fuzzy_kn_sp_ratio, 3.0, "Fuzzy if less than this", | |
| 118 ccstruct_->params()) | |
| 119 , double_MEMBER(tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg", ccstruct_->params()) | |
| 120 , double_MEMBER(tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg", ccstruct_->params()) | |
| 121 , double_MEMBER(tosp_min_sane_kn_sp, 1.5, "Don't trust spaces less than this time kn", | |
| 122 ccstruct_->params()) | |
| 123 , double_MEMBER(tosp_init_guess_kn_mult, 2.2, "Thresh guess - mult kn by this", | |
| 124 ccstruct_->params()) | |
| 125 , double_MEMBER(tosp_init_guess_xht_mult, 0.28, "Thresh guess - mult xht by this", | |
| 126 ccstruct_->params()) | |
| 127 , double_MEMBER(tosp_max_sane_kn_thresh, 5.0, "Multiplier on kn to limit thresh", | |
| 128 ccstruct_->params()) | |
| 129 , double_MEMBER(tosp_flip_caution, 0.0, "Don't autoflip kn to sp when large separation", | |
| 130 ccstruct_->params()) | |
| 131 , double_MEMBER(tosp_large_kerning, 0.19, "Limit use of xht gap with large kns", | |
| 132 ccstruct_->params()) | |
| 133 , double_MEMBER(tosp_dont_fool_with_small_kerns, -1, "Limit use of xht gap with odd small kns", | |
| 134 ccstruct_->params()) | |
| 135 , double_MEMBER(tosp_near_lh_edge, 0, "Don't reduce box if the top left is non blank", | |
| 136 ccstruct_->params()) | |
| 137 , double_MEMBER(tosp_silly_kn_sp_gap, 0.2, "Don't let sp minus kn get too small", | |
| 138 ccstruct_->params()) | |
| 139 , double_MEMBER(tosp_pass_wide_fuzz_sp_to_context, 0.75, "How wide fuzzies need context", | |
| 140 ccstruct_->params()) | |
| 141 , | |
| 142 // tordmain.cpp parameters (the textord_* members that follow). | |
| 143 BOOL_MEMBER(textord_no_rejects, false, "Don't remove noise blobs", ccstruct_->params()) | |
| 144 , BOOL_MEMBER(textord_show_blobs, false, "Display unsorted blobs", ccstruct_->params()) | |
| 145 , BOOL_MEMBER(textord_show_boxes, false, "Display unsorted blobs", ccstruct_->params()) | |
| 146 , INT_MEMBER(textord_max_noise_size, 7, "Pixel size of noise", ccstruct_->params()) | |
| 147 , INT_MEMBER(textord_baseline_debug, 0, "Baseline debug level", ccstruct_->params()) | |
| 148 , double_MEMBER(textord_noise_area_ratio, 0.7, "Fraction of bounding box for noise", | |
| 149 ccstruct_->params()) | |
| 150 , double_MEMBER(textord_initialx_ile, 0.75, "Ile of sizes for xheight guess", | |
| 151 ccstruct_->params()) | |
| 152 , double_MEMBER(textord_initialasc_ile, 0.90, "Ile of sizes for xheight guess", | |
| 153 ccstruct_->params()) | |
| 154 , INT_MEMBER(textord_noise_sizefraction, 10, "Fraction of size for maxima", ccstruct_->params()) | |
| 155 , double_MEMBER(textord_noise_sizelimit, 0.5, "Fraction of x for big t count", | |
| 156 ccstruct_->params()) | |
| 157 , INT_MEMBER(textord_noise_translimit, 16, "Transitions for normal blob", ccstruct_->params()) | |
| 158 , double_MEMBER(textord_noise_normratio, 2.0, "Dot to norm ratio for deletion", | |
| 159 ccstruct_->params()) | |
| 160 , BOOL_MEMBER(textord_noise_rejwords, true, "Reject noise-like words", ccstruct_->params()) | |
| 161 , BOOL_MEMBER(textord_noise_rejrows, true, "Reject noise-like rows", ccstruct_->params()) | |
| 162 , double_MEMBER(textord_noise_syfract, 0.2, "xh fract height error for norm blobs", | |
| 163 ccstruct_->params()) | |
| 164 , double_MEMBER(textord_noise_sxfract, 0.4, "xh fract width error for norm blobs", | |
| 165 ccstruct_->params()) | |
| 166 , double_MEMBER(textord_noise_hfract, 1.0 / 64, | |
| 167 "Height fraction to discard outlines as speckle noise", ccstruct_->params()) | |
| 168 , INT_MEMBER(textord_noise_sncount, 1, "super norm blobs to save row", ccstruct_->params()) | |
| 169 , double_MEMBER(textord_noise_rowratio, 6.0, "Dot to norm ratio for deletion", | |
| 170 ccstruct_->params()) | |
| 171 , BOOL_MEMBER(textord_noise_debug, false, "Debug row garbage detector", ccstruct_->params()) | |
| 172 , double_MEMBER(textord_blshift_maxshift, 0.00, "Max baseline shift", ccstruct_->params()) | |
| 173 , double_MEMBER(textord_blshift_xfraction, 9.99, "Min size of baseline shift", | |
| 174 ccstruct_->params()) {} | |
| 175 | |
| 176 // Makes the text lines and words inside each block. Records width/height in page_tr_ and writes the row-making result to *gradient (0.0f when neither row maker runs). | |
| 177 void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, | |
| 178 Image binary_pix, Image thresholds_pix, Image grey_pix, bool use_box_bottoms, | |
| 179 BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, | |
| 180 TO_BLOCK_LIST *to_blocks, float *gradient) { | |
| 181 page_tr_.set_x(width); | |
| 182 page_tr_.set_y(height); | |
| 183 if (to_blocks->empty()) { | |
| 184 // An empty to_blocks means AutoPageSeg was not used, so find_components must run first. | |
| 185 find_components(binary_pix, blocks, to_blocks); | |
| 186 TO_BLOCK_IT it(to_blocks); | |
| 187 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 188 TO_BLOCK *to_block = it.data(); | |
| 189 // Compute the edge offsets whether or not there is a grey_pix. | |
| 190 // Auto page seg was bypassed, so the edge offsets have to be computed here. | |
| 191 // Given the page segmentation mode, there is no non-text that needs to be skipped. | |
| 192 to_block->ComputeEdgeOffsets(thresholds_pix, grey_pix); | |
| 193 } | |
| 194 } else if (!PSM_SPARSE(pageseg_mode)) { | |
| 195 // AutoPageSeg already found the components, so find_components is not needed. | |
| 196 // filter_blobs sets up the TO_BLOCKs the same as find_components does. | |
| 197 filter_blobs(page_tr_, to_blocks, true); | |
| 198 } | |
| 199 | |
| 200 ASSERT_HOST(!to_blocks->empty()); | |
| 201 if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) { | |
| 202 const FCOORD anticlockwise90(0.0f, 1.0f); | |
| 203 const FCOORD clockwise90(0.0f, -1.0f); | |
| 204 TO_BLOCK_IT it(to_blocks); | |
| 205 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 206 TO_BLOCK *to_block = it.data(); | |
| 207 BLOCK *block = to_block->block; | |
| 208 // Create a fake PT_VERTICAL_TEXT poly_block in block from its bounding box. | |
| 209 block->pdblk.set_poly_block(new POLY_BLOCK(block->pdblk.bounding_box(), PT_VERTICAL_TEXT)); | |
| 210 // Rotate the to_block along with its contained block and blobnbox lists. | |
| 211 to_block->rotate(anticlockwise90); | |
| 212 // Set the block's rotation values to obey the convention followed in | |
| 213 // layout analysis for vertical text. | |
| 214 block->set_re_rotation(clockwise90); | |
| 215 block->set_classify_rotation(clockwise90); | |
| 216 } | |
| 217 } | |
| 218 | |
| 219 TO_BLOCK_IT to_block_it(to_blocks); | |
| 220 TO_BLOCK *to_block = to_block_it.data(); | |
| 221 // Make the rows in the block. Of the branches below, only make_single_row uses to_block. | |
| 222 // Do it the old fashioned way. | |
| 223 if (PSM_LINE_FIND_ENABLED(pageseg_mode)) { | |
| 224 *gradient = make_rows(page_tr_, to_blocks); | |
| 225 } else if (!PSM_SPARSE(pageseg_mode)) { | |
| 226 // RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row. | |
| 227 *gradient = make_single_row(page_tr_, pageseg_mode != PSM_RAW_LINE, to_block, to_blocks); | |
| 228 } else { | |
| 229 *gradient = 0.0f; | |
| 230 } | |
| 231 BaselineDetect baseline_detector(textord_baseline_debug, reskew, to_blocks); | |
| 232 baseline_detector.ComputeStraightBaselines(use_box_bottoms); | |
| 233 baseline_detector.ComputeBaselineSplinesAndXheights( | |
| 234 page_tr_, pageseg_mode != PSM_RAW_LINE, textord_heavy_nr, textord_show_final_rows, this); | |
| 235 // Now make the words in the lines. | |
| 236 if (PSM_WORD_FIND_ENABLED(pageseg_mode)) { | |
| 237 // SINGLE_LINE uses the old word maker on the single line. | |
| 238 make_words(this, page_tr_, *gradient, blocks, to_blocks); | |
| 239 } else { | |
| 240 // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a | |
| 241 // single word, and in SINGLE_CHAR mode, all the outlines | |
| 242 // go in a single blob. | |
| 243 TO_BLOCK *to_block = to_block_it.data(); | |
| 244 make_single_word(pageseg_mode == PSM_SINGLE_CHAR, to_block->get_rows(), | |
| 245 to_block->block->row_list()); | |
| 246 } | |
| 247 // Remove empties. | |
| 248 cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks); | |
| 249 TransferDiacriticsToBlockGroups(diacritic_blobs, blocks); | |
| 250 // Compute the margins for each row in the block, to be used later for | |
| 251 // paragraph detection. | |
| 252 BLOCK_IT b_it(blocks); | |
| 253 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { | |
| 254 b_it.data()->compute_row_margins(); | |
| 255 } | |
| 256 #ifndef GRAPHICS_DISABLED | |
| 257 close_to_win(); | |
| 258 #endif | |
| 259 } | |
| 260 | |
| 261 // If we were supposed to return only a single textline, and there is more | |
| 262 // than one, clean up and leave only the best. Does nothing when line finding is enabled or the mode is sparse. | |
| 263 void Textord::CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res) { | |
| 264 if (PSM_LINE_FIND_ENABLED(pageseg_mode) || PSM_SPARSE(pageseg_mode)) { | |
| 265 return; // No cleanup required. | |
| 266 } | |
| 267 PAGE_RES_IT it(page_res); | |
| 268 // Find the best row: the one whose words have the greatest mean best_choice certainty. | |
| 269 float row_total_conf = 0.0f; | |
| 270 int row_word_count = 0; | |
| 271 ROW_RES *best_row = nullptr; | |
| 272 float best_conf = 0.0f; | |
| 273 for (it.restart_page(); it.word() != nullptr; it.forward()) { | |
| 274 WERD_RES *word = it.word(); | |
| 275 row_total_conf += word->best_choice->certainty(); | |
| 276 ++row_word_count; | |
| 277 if (it.next_row() != it.row()) { | |
| 278 row_total_conf /= row_word_count; | |
| 279 if (best_row == nullptr || best_conf < row_total_conf) { | |
| 280 best_row = it.row(); | |
| 281 best_conf = row_total_conf; | |
| 282 } | |
| 283 row_total_conf = 0.0f; | |
| 284 row_word_count = 0; | |
| 285 } | |
| 286 } | |
| 287 // Now delete every word whose row is not best_row. | |
| 288 for (it.restart_page(); it.word() != nullptr; it.forward()) { | |
| 289 if (it.row() != best_row) { | |
| 290 it.DeleteCurrentWord(); | |
| 291 } | |
| 292 } | |
| 293 } | |
| 294 | |
| 295 } // namespace tesseract. |
