Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/tesseractclass.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: tesseractclass.cpp | |
| 3 // Description: The Tesseract class. It holds/owns everything needed | |
| 4 // to run Tesseract on a single language, and also a set of | |
| 5 // sub-Tesseracts to run sub-languages. For thread safety, *every* | |
| 6 // variable that was previously global or static (except for | |
| 7 // constant data, and some visual debugging flags) has been moved | |
| 8 // in here, directly, or indirectly. | |
| 9 // This makes it safe to run multiple Tesseracts in different | |
| 10 // threads in parallel, and keeps the different language | |
| 11 // instances separate. | |
| 12 // Some global functions remain, but they are isolated re-entrant | |
| 13 // functions that operate on their arguments. Functions that work | |
| 14 // on variable data have been moved to an appropriate class based | |
| 15 // mostly on the directory hierarchy. For more information see | |
| 16 // slide 6 of "2ArchitectureAndDataStructures" in | |
| 17 // https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing | |
| 18 // Some global data and related functions still exist in the | |
| 19 // training-related code, but they don't interfere with normal | |
| 20 // recognition operation. | |
| 21 // Author: Ray Smith | |
| 22 // | |
| 23 // (C) Copyright 2008, Google Inc. | |
| 24 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 25 // you may not use this file except in compliance with the License. | |
| 26 // You may obtain a copy of the License at | |
| 27 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 28 // Unless required by applicable law or agreed to in writing, software | |
| 29 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 30 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 31 // See the License for the specific language governing permissions and | |
| 32 // limitations under the License. | |
| 33 // | |
| 34 /////////////////////////////////////////////////////////////////////// | |
| 35 | |
| 36 // Include automatically generated configuration file if running autoconf. | |
| 37 #ifdef HAVE_CONFIG_H | |
| 38 # include "config_auto.h" | |
| 39 #endif | |
| 40 | |
| 41 #include "tesseractclass.h" | |
| 42 | |
| 43 #include <allheaders.h> | |
| 44 #include "edgblob.h" | |
| 45 #ifndef DISABLED_LEGACY_ENGINE | |
| 46 # include "equationdetect.h" | |
| 47 #endif | |
| 48 #include "lstmrecognizer.h" | |
| 49 #include "thresholder.h" // for ThresholdMethod | |
| 50 | |
| 51 namespace tesseract { | |
| 52 | |
| 53 Tesseract::Tesseract() | |
| 54 : BOOL_MEMBER(tessedit_resegment_from_boxes, false, | |
| 55 "Take segmentation and labeling from box file", this->params()) | |
| 56 , BOOL_MEMBER(tessedit_resegment_from_line_boxes, false, | |
| 57 "Conversion of word/line box file to char box file", this->params()) | |
| 58 , BOOL_MEMBER(tessedit_train_from_boxes, false, "Generate training data from boxed chars", | |
| 59 this->params()) | |
| 60 , BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, "Generate more boxes from boxed chars", | |
| 61 this->params()) | |
| 62 , BOOL_MEMBER(tessedit_train_line_recognizer, false, | |
| 63 "Break input into lines and remap boxes if present", this->params()) | |
| 64 , BOOL_MEMBER(tessedit_dump_pageseg_images, false, | |
| 65 "Dump intermediate images made during page segmentation", this->params()) | |
| 66 // TODO: remove deprecated tessedit_do_invert in release 6. | |
| 67 , BOOL_MEMBER(tessedit_do_invert, true, | |
| 68 "Try inverted line image if necessary (deprecated, will be " | |
| 69 "removed in release 6, use the 'invert_threshold' parameter instead)", | |
| 70 this->params()) | |
| 71 , double_MEMBER(invert_threshold, 0.7, | |
| 72 "For lines with a mean confidence below this value, OCR is also tried with an inverted image", | |
| 73 this->params()) | |
| 74 , | |
| 75 // The default for pageseg_mode is the old behaviour, so as not to | |
| 76 // upset anything that relies on that. | |
| 77 INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK, | |
| 78 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, " | |
| 79 "4=column," | |
| 80 " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char," | |
| 81 "11=sparse_text, 12=sparse_text+osd, 13=raw_line" | |
| 82 " (Values from PageSegMode enum in tesseract/publictypes.h)", | |
| 83 this->params()) | |
| 84 , INT_MEMBER(thresholding_method, | |
| 85 static_cast<int>(ThresholdMethod::Otsu), | |
| 86 "Thresholding method: 0 = Otsu, 1 = LeptonicaOtsu, 2 = " | |
| 87 "Sauvola", | |
| 88 this->params()) | |
| 89 , BOOL_MEMBER(thresholding_debug, false, | |
| 90 "Debug the thresholding process", | |
| 91 this->params()) | |
| 92 , double_MEMBER(thresholding_window_size, 0.33, | |
| 93 "Window size for measuring local statistics (to be " | |
| 94 "multiplied by image DPI). " | |
| 95 "This parameter is used by the Sauvola thresholding method", | |
| 96 this->params()) | |
| 97 , double_MEMBER(thresholding_kfactor, 0.34, | |
| 98 "Factor for reducing threshold due to variance. " | |
| 99 "This parameter is used by the Sauvola thresholding method." | |
| 100 " Normal range: 0.2-0.5", | |
| 101 this->params()) | |
| 102 , double_MEMBER(thresholding_tile_size, 0.33, | |
| 103 "Desired tile size (to be multiplied by image DPI). " | |
| 104 "This parameter is used by the LeptonicaOtsu thresholding " | |
| 105 "method", | |
| 106 this->params()) | |
| 107 , double_MEMBER(thresholding_smooth_kernel_size, 0.0, | |
| 108 "Size of convolution kernel applied to threshold array " | |
| 109 "(to be multiplied by image DPI). Use 0 for no smoothing. " | |
| 110 "This parameter is used by the LeptonicaOtsu thresholding " | |
| 111 "method", | |
| 112 this->params()) | |
| 113 , double_MEMBER(thresholding_score_fraction, 0.1, | |
| 114 "Fraction of the max Otsu score. " | |
| 115 "This parameter is used by the LeptonicaOtsu thresholding " | |
| 116 "method. " | |
| 117 "For standard Otsu use 0.0, otherwise 0.1 is recommended", | |
| 118 this->params()) | |
| 119 , INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT, | |
| 120 "Which OCR engine(s) to run (Tesseract, LSTM, both)." | |
| 121 " Defaults to loading and running the most accurate" | |
| 122 " available.", | |
| 123 this->params()) | |
| 124 , STRING_MEMBER(tessedit_char_blacklist, "", "Blacklist of chars not to recognize", | |
| 125 this->params()) | |
| 126 , STRING_MEMBER(tessedit_char_whitelist, "", "Whitelist of chars to recognize", this->params()) | |
| 127 , STRING_MEMBER(tessedit_char_unblacklist, "", | |
| 128 "List of chars to override tessedit_char_blacklist", this->params()) | |
| 129 , BOOL_MEMBER(tessedit_ambigs_training, false, "Perform training for ambiguities", | |
| 130 this->params()) | |
| 131 , INT_MEMBER(pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT, | |
| 132 "Whether to use the top-line splitting process for Devanagari " | |
| 133 "documents while performing page-segmentation.", | |
| 134 this->params()) | |
| 135 , INT_MEMBER(ocr_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT, | |
| 136 "Whether to use the top-line splitting process for Devanagari " | |
| 137 "documents while performing ocr.", | |
| 138 this->params()) | |
| 139 , STRING_MEMBER(tessedit_write_params_to_file, "", "Write all parameters to the given file.", | |
| 140 this->params()) | |
| 141 , BOOL_MEMBER(tessedit_adaption_debug, false, | |
| 142 "Generate and print debug" | |
| 143 " information for adaption", | |
| 144 this->params()) | |
| 145 , INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()) | |
| 146 , INT_MEMBER(applybox_debug, 1, "Debug level", this->params()) | |
| 147 , INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", this->params()) | |
| 148 , STRING_MEMBER(applybox_exposure_pattern, ".exp", | |
| 149 "Exposure value follows" | |
| 150 " this pattern in the image filename. The name of the image" | |
| 151 " files are expected to be in the form" | |
| 152 " [lang].[fontname].exp[num].tif", | |
| 153 this->params()) | |
| 154 , BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false, | |
| 155 "Learn both character fragments (as is done in the" | |
| 156 " special low exposure mode) as well as unfragmented" | |
| 157 " characters.", | |
| 158 this->params()) | |
| 159 , BOOL_MEMBER(applybox_learn_ngrams_mode, false, | |
| 160 "Each bounding box" | |
| 161 " is assumed to contain ngrams. Only learn the ngrams" | |
| 162 " whose outlines overlap horizontally.", | |
| 163 this->params()) | |
| 164 , BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", this->params()) | |
| 165 , BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices", this->params()) | |
| 166 , BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", this->params()) | |
| 167 , BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces", this->params()) | |
| 168 , BOOL_MEMBER(tessedit_unrej_any_wd, false, "Don't bother with word plausibility", | |
| 169 this->params()) | |
| 170 , BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", this->params()) | |
| 171 , BOOL_MEMBER(tessedit_enable_doc_dict, true, "Add words to the document dictionary", | |
| 172 this->params()) | |
| 173 , BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", this->params()) | |
| 174 , INT_MEMBER(tessedit_font_id, 0, "Font ID to use or zero", this->params()) | |
| 175 , BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", this->params()) | |
| 176 , BOOL_MEMBER(tessedit_enable_bigram_correction, true, | |
| 177 "Enable correction based on the word bigram dictionary.", this->params()) | |
| 178 , BOOL_MEMBER(tessedit_enable_dict_correction, false, | |
| 179 "Enable single word correction based on the dictionary.", this->params()) | |
| 180 , INT_MEMBER(tessedit_bigram_debug, 0, "Amount of debug output for bigram correction.", | |
| 181 this->params()) | |
| 182 , BOOL_MEMBER(enable_noise_removal, true, | |
| 183 "Remove and conditionally reassign small outlines when they" | |
| 184 " confuse layout analysis, determining diacritics vs noise", | |
| 185 this->params()) | |
| 186 , INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines", this->params()) | |
| 187 , | |
| 188 // Worst (min) certainty, for which a diacritic is allowed to make the | |
| 189 // base | |
| 190 // character worse and still be included. | |
| 191 double_MEMBER(noise_cert_basechar, -8.0, "Hingepoint for base char certainty", this->params()) | |
| 192 , | |
| 193 // Worst (min) certainty, for which a non-overlapping diacritic is allowed | |
| 194 // to make the base character worse and still be included. | |
| 195 double_MEMBER(noise_cert_disjoint, -1.0, "Hingepoint for disjoint certainty", this->params()) | |
| 196 , | |
| 197 // Worst (min) certainty, for which a diacritic is allowed to make a new | |
| 198 // stand-alone blob. | |
| 199 double_MEMBER(noise_cert_punc, -3.0, "Threshold for new punc char certainty", this->params()) | |
| 200 , | |
| 201 // Factor of certainty margin for adding diacritics to not count as worse. | |
| 202 double_MEMBER(noise_cert_factor, 0.375, "Scaling on certainty diff from Hingepoint", | |
| 203 this->params()) | |
| 204 , INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob", this->params()) | |
| 205 , INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word", this->params()) | |
| 206 , INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()) | |
| 207 , STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation", this->params()) | |
| 208 , STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation", this->params()) | |
| 209 , STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation", this->params()) | |
| 210 , double_MEMBER(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit", this->params()) | |
| 211 , double_MEMBER(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit", this->params()) | |
| 212 , double_MEMBER(quality_outline_pc, 1.0, "good_quality_doc lte outline error limit", | |
| 213 this->params()) | |
| 214 , double_MEMBER(quality_char_pc, 0.95, "good_quality_doc gte good char limit", this->params()) | |
| 215 , INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word", this->params()) | |
| 216 , INT_MEMBER(tessedit_tess_adaption_mode, 0x27, "Adaptation decision algorithm for tess", | |
| 217 this->params()) | |
| 218 , BOOL_MEMBER(tessedit_minimal_rej_pass1, false, "Do minimal rejection on pass 1 output", | |
| 219 this->params()) | |
| 220 , BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria", this->params()) | |
| 221 , BOOL_MEMBER(test_pt, false, "Test for point", this->params()) | |
| 222 , double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()) | |
| 223 , double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()) | |
| 224 , INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.", this->params()) | |
| 225 , INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", this->params()) | |
| 226 , BOOL_MEMBER(paragraph_text_based, true, | |
| 227 "Run paragraph detection on the post-text-recognition " | |
| 228 "(more accurate)", | |
| 229 this->params()) | |
| 230 , BOOL_MEMBER(lstm_use_matrix, 1, "Use ratings matrix/beam search with lstm", this->params()) | |
| 231 , STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", this->params()) | |
| 232 , STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", this->params()) | |
| 233 , BOOL_MEMBER(tessedit_good_quality_unrej, true, "Reduce rejection on good docs", | |
| 234 this->params()) | |
| 235 , BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?", this->params()) | |
| 236 , double_MEMBER(tessedit_reject_doc_percent, 65.00, "%rej allowed before rej whole doc", | |
| 237 this->params()) | |
| 238 , double_MEMBER(tessedit_reject_block_percent, 45.00, "%rej allowed before rej whole block", | |
| 239 this->params()) | |
| 240 , double_MEMBER(tessedit_reject_row_percent, 40.00, "%rej allowed before rej whole row", | |
| 241 this->params()) | |
| 242 , double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00, | |
| 243 "Number of row rejects in whole word rejects" | |
| 244 " which prevents whole row rejection", | |
| 245 this->params()) | |
| 246 , BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true, | |
| 247 "Only rej partially rejected words in block rejection", this->params()) | |
| 248 , BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true, | |
| 249 "Only rej partially rejected words in row rejection", this->params()) | |
| 250 , BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, "Use word segmentation quality metric", | |
| 251 this->params()) | |
| 252 , BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, "Use word segmentation quality metric", | |
| 253 this->params()) | |
| 254 , INT_MEMBER(tessedit_preserve_min_wd_len, 2, "Only preserve wds longer than this", | |
| 255 this->params()) | |
| 256 , BOOL_MEMBER(tessedit_row_rej_good_docs, true, "Apply row rejection to good docs", | |
| 257 this->params()) | |
| 258 , double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1, | |
| 259 "rej good doc wd if more than this fraction rejected", this->params()) | |
| 260 , BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds", this->params()) | |
| 261 , BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats", this->params()) | |
| 262 , BOOL_MEMBER(tessedit_debug_quality_metrics, false, "Output data to debug file", | |
| 263 this->params()) | |
| 264 , BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks", this->params()) | |
| 265 , double_MEMBER(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit", this->params()) | |
| 266 , BOOL_MEMBER(unlv_tilde_crunching, false, "Mark v.bad words for tilde crunch", this->params()) | |
| 267 , BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", this->params()) | |
| 268 , BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output", | |
| 269 this->params()) | |
| 270 , BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params()) | |
| 271 , BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?", this->params()) | |
| 272 , double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", this->params()) | |
| 273 , BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()) | |
| 274 , double_MEMBER(crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this", this->params()) | |
| 275 , double_MEMBER(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this", this->params()) | |
| 276 , double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this", this->params()) | |
| 277 , double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this", this->params()) | |
| 278 , double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this", this->params()) | |
| 279 , double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this", this->params()) | |
| 280 , double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this", this->params()) | |
| 281 , double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this", this->params()) | |
| 282 , double_MEMBER(crunch_del_min_width, 3.0, "Del if word width lt xht x this", this->params()) | |
| 283 , double_MEMBER(crunch_del_high_word, 1.5, "Del if word gt xht x this above bl", this->params()) | |
| 284 , double_MEMBER(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl", this->params()) | |
| 285 , double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this", this->params()) | |
| 286 , INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch", this->params()) | |
| 287 , INT_MEMBER(crunch_pot_indicators, 1, "How many potential indicators needed", this->params()) | |
| 288 , BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings", this->params()) | |
| 289 , BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", this->params()) | |
| 290 , BOOL_MEMBER(crunch_leave_accept_strings, false, "Don't pot crunch sensible strings", | |
| 291 this->params()) | |
| 292 , BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", this->params()) | |
| 293 , INT_MEMBER(crunch_leave_lc_strings, 4, "Don't crunch words with long lower case strings", | |
| 294 this->params()) | |
| 295 , INT_MEMBER(crunch_leave_uc_strings, 4, "Don't crunch words with long lower case strings", | |
| 296 this->params()) | |
| 297 , INT_MEMBER(crunch_long_repetitions, 3, "Crunch words with long repetitions", this->params()) | |
| 298 , INT_MEMBER(crunch_debug, 0, "As it says", this->params()) | |
| 299 , INT_MEMBER(fixsp_non_noise_limit, 1, "How many non-noise blbs either side?", this->params()) | |
| 300 , double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this", this->params()) | |
| 301 , BOOL_MEMBER(tessedit_prefer_joined_punct, false, "Reward punctuation joins", this->params()) | |
| 302 , INT_MEMBER(fixsp_done_mode, 1, "What constitutes done for spacing", this->params()) | |
| 303 , INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug", this->params()) | |
| 304 , STRING_MEMBER(numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers", this->params()) | |
| 305 , INT_MEMBER(x_ht_acceptance_tolerance, 8, | |
| 306 "Max allowed deviation of blob top outside of font data", this->params()) | |
| 307 , INT_MEMBER(x_ht_min_change, 8, "Min change in xht before actually trying it", this->params()) | |
| 308 , INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer", this->params()) | |
| 309 , double_MEMBER(superscript_worse_certainty, 2.0, | |
| 310 "How many times worse " | |
| 311 "certainty does a superscript position glyph need to be for " | |
| 312 "us to try classifying it as a char with a different " | |
| 313 "baseline?", | |
| 314 this->params()) | |
| 315 , double_MEMBER(superscript_bettered_certainty, 0.97, | |
| 316 "What reduction in " | |
| 317 "badness do we think sufficient to choose a superscript " | |
| 318 "over what we'd thought. For example, a value of 0.6 means " | |
| 319 "we want to reduce badness of certainty by at least 40%", | |
| 320 this->params()) | |
| 321 , double_MEMBER(superscript_scaledown_ratio, 0.4, | |
| 322 "A superscript scaled down more than this is unbelievably " | |
| 323 "small. For example, 0.3 means we expect the font size to " | |
| 324 "be no smaller than 30% of the text line font size.", | |
| 325 this->params()) | |
| 326 , double_MEMBER(subscript_max_y_top, 0.5, | |
| 327 "Maximum top of a character measured as a multiple of " | |
| 328 "x-height above the baseline for us to reconsider whether " | |
| 329 "it's a subscript.", | |
| 330 this->params()) | |
| 331 , double_MEMBER(superscript_min_y_bottom, 0.3, | |
| 332 "Minimum bottom of a character measured as a multiple of " | |
| 333 "x-height above the baseline for us to reconsider whether " | |
| 334 "it's a superscript.", | |
| 335 this->params()) | |
| 336 , BOOL_MEMBER(tessedit_write_block_separators, false, "Write block separators in output", | |
| 337 this->params()) | |
| 338 , BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code", this->params()) | |
| 339 , BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", this->params()) | |
| 340 , BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params()) | |
| 341 , BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params()) | |
| 342 , BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params()) | |
| 343 , BOOL_MEMBER(tessedit_create_page_xml, false, "Write .page.xml PAGE file", this->params()) | |
| 344 , BOOL_MEMBER(page_xml_polygon, true, "Create the PAGE file with polygons instead of box values", this->params()) | |
| 345 , INT_MEMBER(page_xml_level, 0, "Create the PAGE file on 0=line or 1=word level.", this->params()) | |
| 346 , BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training", | |
| 347 this->params()) | |
| 348 , BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params()) | |
| 349 , BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file", | |
| 350 this->params()) | |
| 351 , BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", this->params()) | |
| 352 , BOOL_MEMBER(textonly_pdf, false, "Create PDF with only one invisible text layer", | |
| 353 this->params()) | |
| 354 , INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params()) | |
| 355 , INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image", this->params()) | |
| 356 , INT_MEMBER(min_characters_to_try, 50, "Specify minimum characters to try during OSD", | |
| 357 this->params()) | |
| 358 , STRING_MEMBER(unrecognised_char, "|", "Output char for unidentified blobs", this->params()) | |
| 359 , INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()) | |
| 360 , INT_MEMBER(suspect_short_words, 2, "Don't suspect dict wds longer than this", this->params()) | |
| 361 , BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", this->params()) | |
| 362 , double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit", this->params()) | |
| 363 , double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", this->params()) | |
| 364 , BOOL_MEMBER(tessedit_minimal_rejection, false, "Only reject tess failures", this->params()) | |
| 365 , BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING", this->params()) | |
| 366 , BOOL_MEMBER(tessedit_word_for_word, false, "Make output have exactly one word per WERD", | |
| 367 this->params()) | |
| 368 , BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, "Don't reject ANYTHING AT ALL", | |
| 369 this->params()) | |
| 370 , INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()) | |
| 371 , BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug", this->params()) | |
| 372 , BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips", this->params()) | |
| 373 , double_MEMBER(tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test", this->params()) | |
| 374 , double_MEMBER(tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test", this->params()) | |
| 375 , BOOL_MEMBER(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector", this->params()) | |
| 376 , BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", this->params()) | |
| 377 , BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check", this->params()) | |
| 378 , BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", this->params()) | |
| 379 , BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control", this->params()) | |
| 380 , BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control", this->params()) | |
| 381 , BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check", this->params()) | |
| 382 , BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check", this->params()) | |
| 383 , double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract", this->params()) | |
| 384 , INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit", this->params()) | |
| 385 , STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej", this->params()) | |
| 386 , STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set", this->params()) | |
| 387 , INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this", this->params()) | |
| 388 , BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes", this->params()) | |
| 389 , INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages, else specific page to process", | |
| 390 this->params()) | |
| 391 , BOOL_MEMBER(tessedit_write_images, false, "Capture the image from the IPE", this->params()) | |
| 392 , BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", this->params()) | |
| 393 , STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()) | |
| 394 , BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", this->params()) | |
| 395 , STRING_MEMBER(tessedit_load_sublangs, "", "List of languages to load with this one", | |
| 396 this->params()) | |
| 397 , BOOL_MEMBER(tessedit_use_primary_params_model, false, | |
| 398 "In multilingual mode use params model of the" | |
| 399 " primary language", | |
| 400 this->params()) | |
| 401 , double_MEMBER(min_orientation_margin, 7.0, "Min acceptable orientation margin", | |
| 402 this->params()) | |
| 403 , BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params()) | |
| 404 , BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params()) | |
| 405 , BOOL_MEMBER(poly_allow_detailed_fx, false, | |
| 406 "Allow feature extractors to see the original outline", this->params()) | |
| 407 , BOOL_INIT_MEMBER(tessedit_init_config_only, false, | |
| 408 "Only initialize with the config file. Useful if the " | |
| 409 "instance is not going to be used for OCR but say only " | |
| 410 "for layout analysis.", | |
| 411 this->params()) | |
| 412 #ifndef DISABLED_LEGACY_ENGINE | |
| 413 , BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", this->params()) | |
| 414 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 415 , BOOL_MEMBER(textord_tabfind_vertical_text, true, "Enable vertical detection", this->params()) | |
| 416 , BOOL_MEMBER(textord_tabfind_force_vertical_text, false, "Force using vertical text page mode", | |
| 417 this->params()) | |
| 418 , double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5, | |
| 419 "Fraction of textlines deemed vertical to use vertical page " | |
| 420 "mode", | |
| 421 this->params()) | |
| 422 , double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75, | |
| 423 "Fraction of height used as a minimum gap for aligned blobs.", this->params()) | |
| 424 , INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", this->params()) | |
| 425 , BOOL_MEMBER(preserve_interword_spaces, false, "Preserve multiple interword spaces", | |
| 426 this->params()) | |
| 427 , STRING_MEMBER(page_separator, "\f", "Page separator (default is form feed control character)", | |
| 428 this->params()) | |
| 429 , INT_MEMBER(lstm_choice_mode, 0, | |
| 430 "Allows to include alternative symbols choices in the hOCR output. " | |
| 431 "Valid input values are 0, 1 and 2. 0 is the default value. " | |
| 432 "With 1 the alternative symbol choices per timestep are included. " | |
| 433 "With 2 alternative symbol choices are extracted from the CTC " | |
| 434 "process instead of the lattice. The choices are mapped per " | |
| 435 "character.", | |
| 436 this->params()) | |
| 437 , INT_MEMBER(lstm_choice_iterations, 5, | |
| 438 "Sets the number of cascading iterations for the Beamsearch in " | |
| 439 "lstm_choice_mode. Note that lstm_choice_mode must be set to a " | |
| 440 "value greater than 0 to produce results.", | |
| 441 this->params()) | |
| 442 , double_MEMBER(lstm_rating_coefficient, 5, | |
| 443 "Sets the rating coefficient for the lstm choices. The smaller the " | |
| 444 "coefficient, the better are the ratings for each choice and less " | |
| 445 "information is lost due to the cut off at 0. The standard value is " | |
| 446 "5", | |
| 447 this->params()) | |
| 448 , BOOL_MEMBER(pageseg_apply_music_mask, false, | |
| 449 "Detect music staff and remove intersecting components", this->params()) | |
| 450 , | |
| 451 | |
| 452 backup_config_file_(nullptr) | |
| 453 , pix_binary_(nullptr) | |
| 454 , pix_grey_(nullptr) | |
| 455 , pix_original_(nullptr) | |
| 456 , pix_thresholds_(nullptr) | |
| 457 , source_resolution_(0) | |
| 458 , textord_(this) | |
| 459 , right_to_left_(false) | |
| 460 , scaled_color_(nullptr) | |
| 461 , scaled_factor_(-1) | |
| 462 , deskew_(1.0f, 0.0f) | |
| 463 , reskew_(1.0f, 0.0f) | |
| 464 , gradient_(0.0f) | |
| 465 , most_recently_used_(this) | |
| 466 , font_table_size_(0) | |
| 467 #ifndef DISABLED_LEGACY_ENGINE | |
| 468 , equ_detect_(nullptr) | |
| 469 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 470 , lstm_recognizer_(nullptr) | |
| 471 , train_line_page_num_(0) {} | |
| 472 | |
| 473 Tesseract::~Tesseract() { | |
| 474 Clear(); | |
| 475 pix_original_.destroy(); | |
| 476 end_tesseract(); | |
| 477 for (auto *lang : sub_langs_) { | |
| 478 delete lang; | |
| 479 } | |
| 480 delete lstm_recognizer_; | |
| 481 lstm_recognizer_ = nullptr; | |
| 482 } | |
| 483 | |
| 484 Dict &Tesseract::getDict() { | |
| 485 if (0 == Classify::getDict().NumDawgs() && AnyLSTMLang()) { | |
| 486 if (lstm_recognizer_ && lstm_recognizer_->GetDict()) { | |
| 487 return *lstm_recognizer_->GetDict(); | |
| 488 } | |
| 489 } | |
| 490 return Classify::getDict(); | |
| 491 } | |
| 492 | |
| 493 void Tesseract::Clear() { | |
| 494 std::string debug_name = imagebasename + "_debug.pdf"; | |
| 495 pixa_debug_.WritePDF(debug_name.c_str()); | |
| 496 pix_binary_.destroy(); | |
| 497 pix_grey_.destroy(); | |
| 498 pix_thresholds_.destroy(); | |
| 499 scaled_color_.destroy(); | |
| 500 deskew_ = FCOORD(1.0f, 0.0f); | |
| 501 reskew_ = FCOORD(1.0f, 0.0f); | |
| 502 gradient_ = 0.0f; | |
| 503 splitter_.Clear(); | |
| 504 scaled_factor_ = -1; | |
| 505 for (auto &sub_lang : sub_langs_) { | |
| 506 sub_lang->Clear(); | |
| 507 } | |
| 508 } | |
| 509 | |
| 510 #ifndef DISABLED_LEGACY_ENGINE | |
| 511 | |
| 512 void Tesseract::SetEquationDetect(EquationDetect *detector) { | |
| 513 equ_detect_ = detector; | |
| 514 equ_detect_->SetLangTesseract(this); | |
| 515 } | |
| 516 | |
| 517 // Clear all memory of adaption for this and all subclassifiers. | |
| 518 void Tesseract::ResetAdaptiveClassifier() { | |
| 519 ResetAdaptiveClassifierInternal(); | |
| 520 for (auto &sub_lang : sub_langs_) { | |
| 521 sub_lang->ResetAdaptiveClassifierInternal(); | |
| 522 } | |
| 523 } | |
| 524 | |
| 525 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 526 | |
| 527 // Clear the document dictionary for this and all subclassifiers. | |
| 528 void Tesseract::ResetDocumentDictionary() { | |
| 529 getDict().ResetDocumentDictionary(); | |
| 530 for (auto &sub_lang : sub_langs_) { | |
| 531 sub_lang->getDict().ResetDocumentDictionary(); | |
| 532 } | |
| 533 } | |
| 534 | |
| 535 void Tesseract::SetBlackAndWhitelist() { | |
| 536 // Set the white and blacklists (if any) | |
| 537 unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(), | |
| 538 tessedit_char_whitelist.c_str(), | |
| 539 tessedit_char_unblacklist.c_str()); | |
| 540 if (lstm_recognizer_) { | |
| 541 UNICHARSET &lstm_unicharset = lstm_recognizer_->GetUnicharset(); | |
| 542 lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(), | |
| 543 tessedit_char_whitelist.c_str(), | |
| 544 tessedit_char_unblacklist.c_str()); | |
| 545 } | |
| 546 // Black and white lists should apply to all loaded classifiers. | |
| 547 for (auto &sub_lang : sub_langs_) { | |
| 548 sub_lang->unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(), | |
| 549 tessedit_char_whitelist.c_str(), | |
| 550 tessedit_char_unblacklist.c_str()); | |
| 551 if (sub_lang->lstm_recognizer_) { | |
| 552 UNICHARSET &lstm_unicharset = sub_lang->lstm_recognizer_->GetUnicharset(); | |
| 553 lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(), | |
| 554 tessedit_char_whitelist.c_str(), | |
| 555 tessedit_char_unblacklist.c_str()); | |
| 556 } | |
| 557 } | |
| 558 } | |
| 559 | |
| 560 // Perform steps to prepare underlying binary image/other data structures for | |
| 561 // page segmentation. | |
| 562 void Tesseract::PrepareForPageseg() { | |
| 563 textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model); | |
| 564 // Find the max splitter strategy over all langs. | |
| 565 auto max_pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>( | |
| 566 static_cast<int32_t>(pageseg_devanagari_split_strategy)); | |
| 567 for (auto &sub_lang : sub_langs_) { | |
| 568 auto pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>( | |
| 569 static_cast<int32_t>(sub_lang->pageseg_devanagari_split_strategy)); | |
| 570 if (pageseg_strategy > max_pageseg_strategy) { | |
| 571 max_pageseg_strategy = pageseg_strategy; | |
| 572 } | |
| 573 sub_lang->pix_binary_.destroy(); | |
| 574 sub_lang->pix_binary_ = pix_binary().clone(); | |
| 575 } | |
| 576 // Perform shiro-rekha (top-line) splitting and replace the current image by | |
| 577 // the newly split image. | |
| 578 splitter_.set_orig_pix(pix_binary()); | |
| 579 splitter_.set_pageseg_split_strategy(max_pageseg_strategy); | |
| 580 if (splitter_.Split(true, &pixa_debug_)) { | |
| 581 ASSERT_HOST(splitter_.splitted_image()); | |
| 582 pix_binary_.destroy(); | |
| 583 pix_binary_ = splitter_.splitted_image().clone(); | |
| 584 } | |
| 585 } | |
| 586 | |
| 587 // Perform steps to prepare underlying binary image/other data structures for | |
| 588 // OCR. The current segmentation is required by this method. | |
| 589 // Note that this method resets pix_binary_ to the original binarized image, | |
| 590 // which may be different from the image actually used for OCR depending on the | |
| 591 // value of devanagari_ocr_split_strategy. | |
| 592 void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) { | |
| 593 // Find the max splitter strategy over all langs. | |
| 594 auto max_ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>( | |
| 595 static_cast<int32_t>(ocr_devanagari_split_strategy)); | |
| 596 for (auto &sub_lang : sub_langs_) { | |
| 597 auto ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>( | |
| 598 static_cast<int32_t>(sub_lang->ocr_devanagari_split_strategy)); | |
| 599 if (ocr_strategy > max_ocr_strategy) { | |
| 600 max_ocr_strategy = ocr_strategy; | |
| 601 } | |
| 602 } | |
| 603 // Utilize the segmentation information available. | |
| 604 splitter_.set_segmentation_block_list(block_list); | |
| 605 splitter_.set_ocr_split_strategy(max_ocr_strategy); | |
| 606 // Run the splitter for OCR | |
| 607 bool split_for_ocr = splitter_.Split(false, &pixa_debug_); | |
| 608 // Restore pix_binary to the binarized original pix for future reference. | |
| 609 ASSERT_HOST(splitter_.orig_pix()); | |
| 610 pix_binary_.destroy(); | |
| 611 pix_binary_ = splitter_.orig_pix().clone(); | |
| 612 // If the pageseg and ocr strategies are different, refresh the block list | |
| 613 // (from the last SegmentImage call) with blobs from the real image to be used | |
| 614 // for OCR. | |
| 615 if (splitter_.HasDifferentSplitStrategies()) { | |
| 616 BLOCK block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_), pixGetHeight(pix_binary_)); | |
| 617 Image pix_for_ocr = split_for_ocr ? splitter_.splitted_image() : splitter_.orig_pix(); | |
| 618 extract_edges(pix_for_ocr, &block); | |
| 619 splitter_.RefreshSegmentationWithNewBlobs(block.blob_list()); | |
| 620 } | |
| 621 // The splitter isn't needed any more after this, so save memory by clearing. | |
| 622 splitter_.Clear(); | |
| 623 } | |
| 624 | |
| 625 } // namespace tesseract |
