Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/tesseractclass.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: tesseractclass.h | |
| 3 // Description: The Tesseract class. It holds/owns everything needed | |
| 4 // to run Tesseract on a single language, and also a set of | |
| 5 // sub-Tesseracts to run sub-languages. For thread safety, *every* | |
| 6 // global variable goes in here, directly, or indirectly. | |
| 7 // This makes it safe to run multiple Tesseracts in different | |
| 8 // threads in parallel, and keeps the different language | |
| 9 // instances separate. | |
| 10 // Author: Ray Smith | |
| 11 // | |
| 12 // (C) Copyright 2008, Google Inc. | |
| 13 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 14 // you may not use this file except in compliance with the License. | |
| 15 // You may obtain a copy of the License at | |
| 16 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 17 // Unless required by applicable law or agreed to in writing, software | |
| 18 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 19 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 20 // See the License for the specific language governing permissions and | |
| 21 // limitations under the License. | |
| 22 // | |
| 23 /////////////////////////////////////////////////////////////////////// | |
| 24 | |
| 25 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_ | |
| 26 #define TESSERACT_CCMAIN_TESSERACTCLASS_H_ | |
| 27 | |
| 28 #ifdef HAVE_CONFIG_H | |
| 29 # include "config_auto.h" // DISABLED_LEGACY_ENGINE | |
| 30 #endif | |
| 31 | |
| 32 #include "control.h" // for ACCEPTABLE_WERD_TYPE | |
| 33 #include "debugpixa.h" // for DebugPixa | |
| 34 #include "devanagari_processing.h" // for ShiroRekhaSplitter | |
| 35 #ifndef DISABLED_LEGACY_ENGINE | |
| 36 # include "docqual.h" // for GARBAGE_LEVEL | |
| 37 #endif | |
| 38 #include "genericvector.h" // for PointerVector | |
| 39 #include "pageres.h" // for WERD_RES (ptr only), PAGE_RES (pt... | |
| 40 #include "params.h" // for BOOL_VAR_H, BoolParam, DoubleParam | |
| 41 #include "points.h" // for FCOORD | |
| 42 #include "ratngs.h" // for ScriptPos, WERD_CHOICE (ptr only) | |
| 43 #include "tessdatamanager.h" // for TessdataManager | |
| 44 #include "textord.h" // for Textord | |
| 45 #include "wordrec.h" // for Wordrec | |
| 46 | |
| 47 #include <tesseract/publictypes.h> // for OcrEngineMode, PageSegMode, OEM_L... | |
| 48 #include <tesseract/unichar.h> // for UNICHAR_ID | |
| 49 | |
| 50 #include <allheaders.h> // for pixDestroy, pixGetWidth, pixGetHe... | |
| 51 | |
| 52 #include <cstdint> // for int16_t, int32_t, uint16_t | |
| 53 #include <cstdio> // for FILE | |
| 54 | |
| 55 namespace tesseract { | |
| 56 | |
| 57 class BLOCK_LIST; | |
| 58 class ETEXT_DESC; | |
| 59 struct OSResults; | |
| 60 class PAGE_RES; | |
| 61 class PAGE_RES_IT; | |
| 62 class ROW; | |
| 63 class SVMenuNode; | |
| 64 class TBOX; | |
| 65 class TO_BLOCK_LIST; | |
| 66 class WERD; | |
| 67 class WERD_CHOICE; | |
| 68 class WERD_RES; | |
| 69 | |
| 70 class ColumnFinder; | |
| 71 class DocumentData; | |
| 72 #ifndef DISABLED_LEGACY_ENGINE | |
| 73 class EquationDetect; | |
| 74 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 75 class ImageData; | |
| 76 class LSTMRecognizer; | |
| 77 class Tesseract; | |
| 78 | |
| 79 // Top-level class for all tesseract global instance data. | |
| 80 // This class either holds or points to all data used by an instance | |
| 81 // of Tesseract, including the memory allocator. When this is | |
| 82 // complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT! | |
| 83 // | |
| 84 // NOTE to developers: Do not create cyclic dependencies through this class! | |
| 85 // The directory dependency tree must remain a tree! To keep this clean, | |
| 86 // lower-level code (eg in ccutil, the bottom level) must never need to | |
| 87 // know about the content of a higher-level directory. | |
| 88 // The following scheme will grant the easiest access to lower-level | |
| 89 // global members without creating a cyclic dependency: | |
| 90 // | |
| 91 // Class Hierarchy (^ = inheritance): | |
| 92 // | |
| 93 // CCUtil (ccutil/ccutil.h) | |
| 94 // ^ Members include: UNICHARSET | |
| 95 // CCStruct (ccstruct/ccstruct.h) | |
| 96 // ^ Members include: Image | |
| 97 // Classify (classify/classify.h) | |
| 98 // ^ Members include: Dict | |
| 99 // Wordrec (wordrec/wordrec.h) | |
| 100 // ^ Members include: WERD*, DENORM* | |
| 101 // Tesseract (ccmain/tesseractclass.h) | |
| 102 // Members include: Pix* | |
| 103 // | |
| 104 // Other important classes: | |
| 105 // | |
| 106 // TessBaseAPI (tesseract/baseapi.h) | |
| 107 // Members include: BLOCK_LIST*, PAGE_RES*, | |
| 108 // Tesseract*, ImageThresholder* | |
| 109 // Dict (dict/dict.h) | |
| 110 // Members include: Image* (private) | |
| 111 // | |
| 112 // NOTE: each level contains members that correspond to global | |
| 113 // data that is defined (and used) at that level, not necessarily where | |
| 114 // the type is defined. So, for instance: | |
| 115 // BOOL_VAR_H(textord_show_blobs); | |
| 116 // goes inside the Textord class, not the CCUtil class. | |
| 117 | |
| 118 // Bundle of counters and flags kept for statistics and debugging. | |
| 119 struct TesseractStats { | |
| 120 // Each member carries its own initial value, so there is nothing to do here. | |
| 121 TesseractStats() {} | |
| 122 | |
| 123 int32_t adaption_word_number = 0; | |
| 124 int16_t doc_blob_quality = 0; | |
| 125 int16_t doc_outline_errs = 0; | |
| 126 int16_t doc_char_quality = 0; | |
| 127 int16_t good_char_count = 0; | |
| 128 int16_t doc_good_char_quality = 0; | |
| 129 int32_t word_count = 0; // number of words in the document | |
| 130 int32_t dict_words = 0; // number of dictionary words in the document | |
| 131 std::string dump_words_str; // accumulator used by dump_words() | |
| 132 | |
| 133 // Flags used by write_results(); note that two of them start out true. | |
| 134 bool tilde_crunch_written = false; | |
| 135 bool last_char_was_newline = true; | |
| 136 bool last_char_was_tilde = false; | |
| 137 bool write_results_empty_block = true; | |
| 138 }; | |
| 149 | |
| 150 // Gathers the pointers that are needed while processing a single word. | |
| 151 struct WordData { | |
| 152 WordData() {} | |
| 153 explicit WordData(const PAGE_RES_IT &page_res_it) | |
| 154 : word(page_res_it.word()) | |
| 155 , row(page_res_it.row()->row) | |
| 156 , block(page_res_it.block()->block) {} | |
| 157 WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res) | |
| 158 : word(word_res), row(row_in), block(block_in) {} | |
| 159 | |
| 160 // Every pointer is null unless one of the constructors above sets it. | |
| 161 WERD_RES *word = nullptr; | |
| 162 ROW *row = nullptr; | |
| 163 BLOCK *block = nullptr; | |
| 164 WordData *prev_word = nullptr; | |
| 165 PointerVector<WERD_RES> lang_words; | |
| 166 }; | |
| 167 | |
| 168 // Definition of a Tesseract WordRecognizer. The WordData provides the context | |
| 169 // of row/block, in_word holds an initialized, possibly pre-classified word, | |
| 170 // that the recognizer may or may not consume (but if so it sets | |
| 171 // *in_word=nullptr) and produces one or more output words in out_words, which | |
| 172 // may be the consumed in_word, or may be generated independently. This api | |
| 173 // allows both a conventional tesseract classifier to work, or a line-level | |
| 174 // classifier that generates multiple words from a merged input. | |
| 175 using WordRecognizer = void (Tesseract::*)(const WordData &, WERD_RES **, | |
| 176 PointerVector<WERD_RES> *); | |
| 177 | |
| 178 class TESS_API Tesseract : public Wordrec { | |
| 179 public: | |
| 180 Tesseract(); | |
| 181 ~Tesseract() override; | |
| 182 | |
| 183 // Return appropriate dictionary | |
| 184 Dict &getDict() override; | |
| 185 | |
| 186 // Clear as much used memory as possible without resetting the adaptive | |
| 187 // classifier or losing any other classifier data. | |
| 188 void Clear(); | |
| 189 // Clear all memory of adaption for this and all subclassifiers. | |
| 190 void ResetAdaptiveClassifier(); | |
| 191 // Clear the document dictionary for this and all subclassifiers. | |
| 192 void ResetDocumentDictionary(); | |
| 193 | |
| 194 #ifndef DISABLED_LEGACY_ENGINE | |
| 195 // Set the equation detector. | |
| 196 void SetEquationDetect(EquationDetect *detector); | |
| 197 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 198 | |
| 199 // Simple accessors. | |
| 200 const FCOORD &reskew() const { /* read-only reference to the reskew_ member */ | |
| 201 return reskew_; | |
| 202 } | |
| 203 float gradient() const { /* returns the gradient_ member by value */ | |
| 204 return gradient_; | |
| 205 } | |
| 206 // Destroys the current pix_binary_ and returns the address of the pix_binary_ member. | |
| 207 Image *mutable_pix_binary() { | |
| 208 pix_binary_.destroy(); | |
| 209 return &pix_binary_; | |
| 210 } | |
| 211 Image pix_binary() const { /* returns the pix_binary_ member by value */ | |
| 212 return pix_binary_; | |
| 213 } | |
| 214 Image pix_grey() const { /* returns the pix_grey_ member by value */ | |
| 215 return pix_grey_; | |
| 216 } | |
| 217 void set_pix_grey(Image grey_pix) { /* destroys the current pix_grey_, then stores grey_pix */ | |
| 218 pix_grey_.destroy(); | |
| 219 pix_grey_ = grey_pix; | |
| 220 } | |
| 221 Image pix_original() const { /* returns the pix_original_ member by value */ | |
| 222 return pix_original_; | |
| 223 } | |
| 224 // Destroys the current pix_original_ and takes ownership of the given original_pix. | |
| 225 void set_pix_original(Image original_pix) { | |
| 226 pix_original_.destroy(); | |
| 227 pix_original_ = original_pix; | |
| 228 // Each sub-language receives its own clone, or nullptr when original_pix is null. | |
| 229 for (auto &lang : sub_langs_) { | |
| 230 lang->set_pix_original(original_pix ? original_pix.clone() : nullptr); | |
| 231 } | |
| 232 } | |
| 233 // Picks the page image to hand out, in this order of preference: | |
| 234 // 1. pix_original_, when its width equals ImageWidth(); | |
| 235 // 2. pix_grey_, when it is non-null; | |
| 236 // 3. pix_binary_ otherwise. | |
| 237 // The result can be of any bit depth, but is never color-mapped. In grey | |
| 238 // and color, 0 is black and 255 is white; for binary input, black is 1 and | |
| 239 // white is 0. pixGetDepth() returns 32, 8 or 1 to tell them apart. | |
| 240 // The returned Pix is borrowed: it must not be deleted or pixDestroyed. | |
| 241 Image BestPix() const { | |
| 242 if (pixGetWidth(pix_original_) == ImageWidth()) { | |
| 243 return pix_original_; | |
| 244 } | |
| 245 if (pix_grey_ != nullptr) { | |
| 246 return pix_grey_; | |
| 247 } | |
| 248 return pix_binary_; | |
| 249 } | |
| 250 void set_pix_thresholds(Image thresholds) { /* destroys the current pix_thresholds_, then stores thresholds */ | |
| 251 pix_thresholds_.destroy(); | |
| 252 pix_thresholds_ = thresholds; | |
| 253 } | |
| 254 int source_resolution() const { /* returns the value last stored by set_source_resolution() */ | |
| 255 return source_resolution_; | |
| 256 } | |
| 257 void set_source_resolution(int ppi) { /* stores ppi in source_resolution_; no validation is done here */ | |
| 258 source_resolution_ = ppi; | |
| 259 } | |
| 260 int ImageWidth() const { /* width of pix_binary_, as reported by pixGetWidth() */ | |
| 261 return pixGetWidth(pix_binary_); | |
| 262 } | |
| 263 int ImageHeight() const { /* height of pix_binary_, as reported by pixGetHeight() */ | |
| 264 return pixGetHeight(pix_binary_); | |
| 265 } | |
| 266 Image scaled_color() const { /* returns the image last stored by SetScaledColor() */ | |
| 267 return scaled_color_; | |
| 268 } | |
| 269 int scaled_factor() const { /* returns the factor last stored by SetScaledColor() */ | |
| 270 return scaled_factor_; | |
| 271 } | |
| 272 void SetScaledColor(int factor, Image color) { /* stores both values; the previous scaled_color_ is not destroyed here */ | |
| 273 scaled_factor_ = factor; | |
| 274 scaled_color_ = color; | |
| 275 } | |
| 276 const Textord &textord() const { /* read-only reference to the textord_ member */ | |
| 277 return textord_; | |
| 278 } | |
| 279 Textord *mutable_textord() { /* address of the textord_ member, for callers that need to modify it */ | |
| 280 return &textord_; | |
| 281 } | |
| 282 | |
| 283 bool right_to_left() const { /* returns the right_to_left_ flag */ | |
| 284 return right_to_left_; | |
| 285 } | |
| 286 int num_sub_langs() const { /* number of entries in sub_langs_, narrowed to int */ | |
| 287 return static_cast<int>(sub_langs_.size()); | |
| 288 } | |
| 289 Tesseract *get_sub_lang(int index) const { /* returns sub_langs_[index]; no range check is done here */ | |
| 290 return sub_langs_[index]; | |
| 291 } | |
| 292 // Reports whether this instance or any of its sub-languages has an engine | |
| 293 // mode other than OEM_LSTM_ONLY, i.e. uses Tesseract (as opposed to LSTM). | |
| 294 bool AnyTessLang() const { | |
| 295 bool found = tessedit_ocr_engine_mode != OEM_LSTM_ONLY; | |
| 296 for (const auto &sub : sub_langs_) { | |
| 297 if (found) { | |
| 298 break; | |
| 299 } | |
| 300 found = sub->tessedit_ocr_engine_mode != OEM_LSTM_ONLY; | |
| 301 } | |
| 302 return found; | |
| 303 } | |
| 304 // Reports whether this instance or any of its sub-languages has an engine | |
| 305 // mode other than OEM_TESSERACT_ONLY, i.e. uses the LSTM. | |
| 306 bool AnyLSTMLang() const { | |
| 307 bool found = tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY; | |
| 308 for (const auto &sub : sub_langs_) { | |
| 309 if (found) { | |
| 310 break; | |
| 311 } | |
| 312 found = sub->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY; | |
| 313 } | |
| 314 return found; | |
| 315 } | |
| 316 | |
| 317 void SetBlackAndWhitelist(); | |
| 318 | |
| 319 // Perform steps to prepare underlying binary image/other data structures for | |
| 320 // page segmentation. Uses the strategy specified in the global variable | |
| 321 // pageseg_devanagari_split_strategy to perform splitting while preparing for | |
| 322 // page segmentation. | |
| 323 void PrepareForPageseg(); | |
| 324 | |
| 325 // Perform steps to prepare underlying binary image/other data structures for | |
| 326 // Tesseract OCR. The current segmentation is required by this method. | |
| 327 // Uses the strategy specified in the global variable | |
| 328 // ocr_devanagari_split_strategy for performing splitting while preparing for | |
| 329 // Tesseract ocr. | |
| 330 void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr); | |
| 331 | |
| 332 int SegmentPage(const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr); | |
| 333 void SetupWordScripts(BLOCK_LIST *blocks); | |
| 334 int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, | |
| 335 BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr); | |
| 336 ColumnFinder *SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, | |
| 337 Tesseract *osd_tess, OSResults *osr, | |
| 338 TO_BLOCK_LIST *to_blocks, Image *photo_mask_pix, | |
| 339 Image *music_mask_pix); | |
| 340 // par_control.cpp | |
| 341 void PrerecAllWordsPar(const std::vector<WordData> &words); | |
| 342 | |
| 343 //// linerec.cpp | |
| 344 // Generates training data for training a line recognizer, eg LSTM. | |
| 345 // Breaks the page into lines, according to the boxes, and writes them to a | |
| 346 // serialized DocumentData based on output_basename. | |
| 347 // Return true if successful, false if an error occurred. | |
| 348 bool TrainLineRecognizer(const char *input_imagename, const std::string &output_basename, | |
| 349 BLOCK_LIST *block_list); | |
| 350 // Generates training data for training a line recognizer, eg LSTM. | |
| 351 // Breaks the boxes into lines, normalizes them, converts to ImageData and | |
| 352 // appends them to the given training_data. | |
| 353 void TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts, | |
| 354 BLOCK_LIST *block_list, DocumentData *training_data); | |
| 355 | |
| 356 // Returns an ImageData containing the image of the given textline, | |
| 357 // and ground truth boxes/truth text if available in the input. | |
| 358 // The image is not normalized in any way. | |
| 359 ImageData *GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes, | |
| 360 const std::vector<std::string> &texts, int start_box, int end_box, | |
| 361 const BLOCK &block); | |
| 362 // Helper gets the image of a rectangle, using the block.re_rotation() if | |
| 363 // needed to get to the image, and rotating the result back to horizontal | |
| 364 // layout. (CJK characters will be on their left sides) The vertical text flag | |
| 365 // is set in the returned ImageData if the text was originally vertical, which | |
| 366 // can be used to invoke a different CJK recognition engine. The revised_box | |
| 367 // is also returned to enable calculation of output bounding boxes. | |
| 368 ImageData *GetRectImage(const TBOX &box, const BLOCK &block, int padding, | |
| 369 TBOX *revised_box) const; | |
| 370 // Recognizes a word or group of words, converting to WERD_RES in *words. | |
| 371 // Analogous to classify_word_pass1, but can handle a group of words as well. | |
| 372 void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, | |
| 373 PointerVector<WERD_RES> *words); | |
| 374 // Apply segmentation search to the given set of words, within the constraints | |
| 375 // of the existing ratings matrix. If a word already has a best_choice, it is | |
| 376 // left untouched and only the done/accepted etc. flags are set. | |
| 377 void SearchWords(PointerVector<WERD_RES> *words); | |
| 378 | |
| 379 //// control.h ///////////////////////////////////////////////////////// | |
| 380 bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, | |
| 381 int pass); | |
| 382 // Sets up the words ready for whichever engine is to be run | |
| 383 void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, | |
| 384 PAGE_RES *page_res, std::vector<WordData> *words); | |
| 385 // Sets up the single word ready for whichever engine is to be run. | |
| 386 void SetupWordPassN(int pass_n, WordData *word); | |
| 387 // Runs word recognition on all the words. | |
| 388 bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, | |
| 389 std::vector<WordData> *words); | |
| 390 bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, | |
| 391 const char *word_config, int dopasses); | |
| 392 void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, | |
| 393 const char *word_config); | |
| 394 void bigram_correction_pass(PAGE_RES *page_res); | |
| 395 void blamer_pass(PAGE_RES *page_res); | |
| 396 // Sets script positions and detects smallcaps on all output words. | |
| 397 void script_pos_pass(PAGE_RES *page_res); | |
| 398 // Helper to recognize the word using the given (language-specific) tesseract. | |
| 399 // Returns positive if this recognizer found more new best words than the | |
| 400 // number kept from best_words. | |
| 401 int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, | |
| 402 WERD_RES **in_word, PointerVector<WERD_RES> *best_words); | |
| 403 // Moves good-looking "noise"/diacritics from the reject list to the main | |
| 404 // blob list on the current word. Returns true if anything was done, and | |
| 405 // sets make_next_word_fuzzy if blob(s) were added to the end of the word. | |
| 406 bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy); | |
| 407 // Attempts to put noise/diacritic outlines into the blobs that they overlap. | |
| 408 // Input: a set of noisy outlines that probably belong to the real_word. | |
| 409 // Output: outlines that overlapped blobs are set to nullptr and put back into | |
| 410 // the word, either in the blobs or in the reject list. | |
| 411 void AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines, int pass, | |
| 412 WERD *real_word, PAGE_RES_IT *pr_it, | |
| 413 std::vector<bool> *word_wanted, | |
| 414 std::vector<bool> *overlapped_any_blob, | |
| 415 std::vector<C_BLOB *> *target_blobs); | |
| 416 // Attempts to assign non-overlapping outlines to their nearest blobs or | |
| 417 // make new blobs out of them. | |
| 418 void AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass, | |
| 419 WERD *real_word, PAGE_RES_IT *pr_it, | |
| 420 std::vector<bool> *word_wanted, | |
| 421 std::vector<C_BLOB *> *target_blobs); | |
| 422 // Starting with ok_outlines set to indicate which outlines overlap the blob, | |
| 423 // chooses the optimal set (approximately) and returns true if any outlines | |
| 424 // are desired, in which case ok_outlines indicates which ones. | |
| 425 bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, | |
| 426 C_BLOB *blob, const std::vector<C_OUTLINE *> &outlines, | |
| 427 int num_outlines, std::vector<bool> *ok_outlines); | |
| 428 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes | |
| 429 // the inclusion of the outlines, and returns the certainty of the raw choice. | |
| 430 float ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines, | |
| 431 const std::vector<C_OUTLINE *> &outlines, int pass_n, | |
| 432 PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str); | |
| 433 // Classifies the given blob (part of word_data->word->word) as an individual | |
| 434 // word, using languages, chopper etc, returning only the certainty of the | |
| 435 // best raw choice, and undoing all the work done to fake out the word. | |
| 436 float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, | |
| 437 float *c2); | |
| 438 void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data); | |
| 439 void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, | |
| 440 PointerVector<WERD_RES> *out_words); | |
| 441 void recog_pseudo_word(PAGE_RES *page_res, // blocks to check | |
| 442 TBOX &selection_box); | |
| 443 | |
| 444 void fix_rep_char(PAGE_RES_IT *page_res_it); | |
| 445 | |
| 446 ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, | |
| 447 const char *lengths); | |
| 448 void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block); | |
| 449 void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, | |
| 450 PointerVector<WERD_RES> *out_words); | |
| 451 void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word); | |
| 452 bool RunOldFixXht(WERD_RES *word, BLOCK *block, ROW *row); | |
| 453 bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row); | |
| 454 // Runs recognition with the test baseline shift and x-height and returns true | |
| 455 // if there was an improvement in recognition result. | |
| 456 bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, | |
| 457 WERD_RES *word, BLOCK *block, ROW *row); | |
| 458 bool recog_interactive(PAGE_RES_IT *pr_it); | |
| 459 | |
| 460 // Set fonts of this word. | |
| 461 void set_word_fonts(WERD_RES *word); | |
| 462 void font_recognition_pass(PAGE_RES *page_res); | |
| 463 void dictionary_correction_pass(PAGE_RES *page_res); | |
| 464 bool check_debug_pt(WERD_RES *word, int location); | |
| 465 | |
| 466 //// superscript.cpp //////////////////////////////////////////////////// | |
| 467 bool SubAndSuperscriptFix(WERD_RES *word_res); | |
| 468 void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, | |
| 469 ScriptPos *leading_pos, float *leading_certainty, | |
| 470 int *num_rebuilt_trailing, ScriptPos *trailing_pos, | |
| 471 float *trailing_certainty, float *avg_certainty, | |
| 472 float *unlikely_threshold); | |
| 473 WERD_RES *TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, | |
| 474 ScriptPos leading_pos, int num_chopped_trailing, | |
| 475 float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, | |
| 476 bool *is_good, int *retry_leading, int *retry_trailing); | |
| 477 bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, | |
| 478 int *left_ok, int *right_ok) const; | |
| 479 | |
| 480 //// output.h ////////////////////////////////////////////////////////// | |
| 481 | |
| 482 void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box); | |
| 483 void write_results(PAGE_RES_IT &page_res_it, // full info | |
| 484 char newline_type, // type of newline | |
| 485 bool force_eol // override tilde crunch? | |
| 486 ); | |
| 487 void set_unlv_suspects(WERD_RES *word); | |
| 488 UNICHAR_ID get_rep_char(WERD_RES *word); // what char is repeated? | |
| 489 bool acceptable_number_string(const char *s, const char *lengths); | |
| 490 int16_t count_alphanums(const WERD_CHOICE &word); | |
| 491 int16_t count_alphas(const WERD_CHOICE &word); | |
| 492 | |
| 493 void read_config_file(const char *filename, SetParamConstraint constraint); | |
| 494 // Initialize for potentially a set of languages defined by the language | |
| 495 // string and recursively any additional languages required by any language | |
| 496 // traineddata file (via tessedit_load_sublangs in its config) that is loaded. | |
| 497 // See init_tesseract_internal for args. | |
| 498 int init_tesseract(const std::string &arg0, const std::string &textbase, | |
| 499 const std::string &language, OcrEngineMode oem, char **configs, | |
| 500 int configs_size, const std::vector<std::string> *vars_vec, | |
| 501 const std::vector<std::string> *vars_values, bool set_only_non_debug_params, | |
| 502 TessdataManager *mgr); | |
| 503 int init_tesseract(const std::string &datapath, const std::string &language, OcrEngineMode oem) { /* convenience overload: forwards to the 10-argument init_tesseract with an empty textbase, no configs or vars, set_only_non_debug_params = false and a local TessdataManager */ | |
| 504 TessdataManager mgr; | |
| 505 return init_tesseract(datapath, {}, language, oem, nullptr, 0, nullptr, nullptr, false, &mgr); | |
| 506 } | |
| 507 // Common initialization for a single language. | |
| 508 // arg0 is the datapath for the tessdata directory, which could be the | |
| 509 // path of the tessdata directory with no trailing /, or (if tessdata | |
| 510 // lives in the same directory as the executable) the path of the executable, | |
| 511 // hence the name arg0. | |
| 512 // textbase is an optional output file basename (used only for training) | |
| 513 // language is the language code to load. | |
| 514 // oem controls which engine(s) will operate on the image | |
| 515 // configs (argv) is an array of config filenames to load variables from. | |
| 516 // May be nullptr. | |
| 517 // configs_size (argc) is the number of elements in configs. | |
| 518 // vars_vec is an optional vector of variables to set. | |
| 519 // vars_values is an optional corresponding vector of values for the variables | |
| 520 // in vars_vec. | |
| 521 // If set_only_non_debug_params is true, only params that do not contain | |
| 522 // "debug" in the name will be set. | |
| 523 int init_tesseract_internal(const std::string &arg0, const std::string &textbase, | |
| 524 const std::string &language, OcrEngineMode oem, char **configs, | |
| 525 int configs_size, const std::vector<std::string> *vars_vec, | |
| 526 const std::vector<std::string> *vars_values, | |
| 527 bool set_only_non_debug_params, TessdataManager *mgr); | |
| 528 | |
| 529 // Set the universal_id member of each font to be unique among all | |
| 530 // instances of the same font loaded. | |
| 531 void SetupUniversalFontIds(); | |
| 532 | |
| 533 void recognize_page(std::string &image_name); | |
| 534 void end_tesseract(); | |
| 535 | |
| 536 bool init_tesseract_lang_data(const std::string &arg0, | |
| 537 const std::string &language, OcrEngineMode oem, char **configs, | |
| 538 int configs_size, const std::vector<std::string> *vars_vec, | |
| 539 const std::vector<std::string> *vars_values, | |
| 540 bool set_only_non_debug_params, TessdataManager *mgr); | |
| 541 | |
| 542 void ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load, | |
| 543 std::vector<std::string> *not_to_load); | |
| 544 | |
| 545 //// pgedit.h ////////////////////////////////////////////////////////// | |
| 546 SVMenuNode *build_menu_new(); | |
| 547 #ifndef GRAPHICS_DISABLED | |
| 548 void pgeditor_main(int width, int height, PAGE_RES *page_res); | |
| 549 | |
| 550 void process_image_event( // action in image win | |
| 551 const SVEvent &event); | |
| 552 bool process_cmd_win_event( // UI command semantics | |
| 553 int32_t cmd_event, // which menu item? | |
| 554 char *new_value // any prompt data | |
| 555 ); | |
| 556 #endif // !GRAPHICS_DISABLED | |
| 557 void debug_word(PAGE_RES *page_res, const TBOX &selection_box); | |
| 558 void do_re_display(bool (tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it)); | |
| 559 bool word_display(PAGE_RES_IT *pr_it); | |
| 560 bool word_bln_display(PAGE_RES_IT *pr_it); | |
| 561 bool word_blank_and_set_display(PAGE_RES_IT *pr_its); | |
| 562 bool word_set_display(PAGE_RES_IT *pr_it); | |
| 563 // #ifndef GRAPHICS_DISABLED | |
| 564 bool word_dumper(PAGE_RES_IT *pr_it); | |
| 565 // #endif // !GRAPHICS_DISABLED | |
| 566 void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box); | |
| 567 //// reject.h ////////////////////////////////////////////////////////// | |
| 568 // make rej map for word | |
| 569 void make_reject_map(WERD_RES *word, ROW *row, int16_t pass); | |
| 570 bool one_ell_conflict(WERD_RES *word_res, bool update_map); | |
| 571 int16_t first_alphanum_index(const char *word, const char *word_lengths); | |
| 572 int16_t first_alphanum_offset(const char *word, const char *word_lengths); | |
| 573 int16_t alpha_count(const char *word, const char *word_lengths); | |
| 574 bool word_contains_non_1_digit(const char *word, const char *word_lengths); | |
| 575 void dont_allow_1Il(WERD_RES *word); | |
| 576 int16_t count_alphanums( // how many alphanums | |
| 577 WERD_RES *word); | |
| 578 void flip_0O(WERD_RES *word); | |
| 579 bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id); | |
| 580 bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id); | |
| 581 bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row); | |
| 582 void nn_match_word( // Match a word | |
| 583 WERD_RES *word, ROW *row); | |
| 584 void nn_recover_rejects(WERD_RES *word, ROW *row); | |
| 585 void set_done( // set done flag | |
| 586 WERD_RES *word, int16_t pass); | |
| 587 int16_t safe_dict_word(const WERD_RES *werd_res); // is best_choice in dict? | |
| 588 void flip_hyphens(WERD_RES *word); | |
| 589 void reject_I_1_L(WERD_RES *word); | |
| 590 void reject_edge_blobs(WERD_RES *word); | |
| 591 void reject_mostly_rejects(WERD_RES *word); | |
| 592 //// adaptions.h /////////////////////////////////////////////////////// | |
| 593 bool word_adaptable( // should we adapt? | |
| 594 WERD_RES *word, uint16_t mode); | |
| 595 | |
| 596 //// tfacepp.cpp /////////////////////////////////////////////////////// | |
| 597 void recog_word_recursive(WERD_RES *word); | |
| 598 void recog_word(WERD_RES *word); | |
| 599 void split_and_recog_word(WERD_RES *word); | |
| 600 void split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece, | |
| 601 BlamerBundle **orig_blamer_bundle) const; | |
| 602 void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const; | |
| 603 //// fixspace.cpp /////////////////////////////////////////////////////// | |
| 604 bool digit_or_numeric_punct(WERD_RES *word, int char_position); | |
| 605 int16_t eval_word_spacing(WERD_RES_LIST &word_res_list); | |
| 606 void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block); | |
| 607 int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list); | |
| 608 void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block); | |
| 609 void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block); | |
| 610 void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block); | |
| 611 void fix_fuzzy_spaces( // find fuzzy words | |
| 612 ETEXT_DESC *monitor, // progress monitor | |
| 613 int32_t word_count, // count of words in doc | |
| 614 PAGE_RES *page_res); | |
| 615 void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved); | |
| 616 bool fixspace_thinks_word_done(WERD_RES *word); | |
| 617 int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score); | |
| 618 float blob_noise_score(TBLOB *blob); | |
| 619 void break_noisiest_blob_word(WERD_RES_LIST &words); | |
| 620 //// docqual.cpp //////////////////////////////////////////////////////// | |
| 621 #ifndef DISABLED_LEGACY_ENGINE | |
| 622 GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word); | |
| 623 bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word); | |
| 624 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 625 void tilde_crunch(PAGE_RES_IT &page_res_it); | |
| 626 void unrej_good_quality_words( // unreject potential | |
| 627 PAGE_RES_IT &page_res_it); | |
| 628 void doc_and_block_rejection( // reject big chunks | |
| 629 PAGE_RES_IT &page_res_it, bool good_quality_doc); | |
| 630 void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc); | |
| 631 void convert_bad_unlv_chs(WERD_RES *word_res); | |
| 632 void tilde_delete(PAGE_RES_IT &page_res_it); | |
| 633 int16_t word_blob_quality(WERD_RES *word); | |
| 634 void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count); | |
| 635 void unrej_good_chs(WERD_RES *word); | |
| 636 int16_t count_outline_errs(char c, int16_t outline_count); | |
| 637 int16_t word_outline_errs(WERD_RES *word); | |
| 638 #ifndef DISABLED_LEGACY_ENGINE | |
| 639 bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level); | |
| 640 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 641 CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode); | |
| 642 int16_t failure_count(WERD_RES *word); | |
| 643 bool noise_outlines(TWERD *word); | |
| 644 //// pagewalk.cpp /////////////////////////////////////////////////////// | |
| 645 void process_selected_words(PAGE_RES *page_res, // blocks to check | |
| 646 // word_processor (the last parameter) is the function to call | |
| 647 TBOX &selection_box, | |
| 648 bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)); | |
| 649 //// tessbox.cpp /////////////////////////////////////////////////////// | |
| 650 void tess_add_doc_word( // test acceptability | |
| 651 WERD_CHOICE *word_choice // after context | |
| 652 ); | |
| 653 void tess_segment_pass_n(int pass_n, WERD_RES *word); | |
| 654 bool tess_acceptable_word(WERD_RES *word); | |
| 655 | |
| 656 //// applybox.cpp ////////////////////////////////////////////////////// | |
| 657 // Applies the box file based on the image name filename, and resegments | |
| 658 // the words in the block_list (page), with: | |
| 659 // blob-mode: one blob per line in the box file, words as input. | |
| 660 // word/line-mode: one blob per space-delimited unit after the #, and one word | |
| 661 // per line in the box file. (See comment above for box file format.) | |
| 662 // If find_segmentation is true, (word/line mode) then the classifier is used | |
| 663 // to re-segment words/lines to match the space-delimited truth string for | |
| 664 // each box. In this case, the input box may be for a word or even a whole | |
| 665 // text line, and the output words will contain multiple blobs corresponding | |
| 666 // to the space-delimited input string. | |
| 667 // With find_segmentation false, no classifier is needed, but the chopper | |
| 668 // can still be used to correctly segment touching characters with the help | |
| 669 // of the input boxes. | |
| 670 // In the returned PAGE_RES, the WERD_RES are set up as they would be returned | |
| 671 // from normal classification, i.e. with a word, chopped_word, rebuild_word, | |
| 672 // seam_array, denorm, box_word, and best_state, but NO best_choice or | |
| 673 // raw_choice, as they would require a UNICHARSET, which we aim to avoid. | |
| 674 // Instead, the correct_text member of WERD_RES is set, and this may be later | |
| 675 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords | |
| 676 // is not required before calling ApplyBoxTraining. | |
| 677 PAGE_RES *ApplyBoxes(const char *filename, bool find_segmentation, BLOCK_LIST *block_list); | |
| 678 | |
| 679 // Any row xheight that is significantly different from the median is set | |
| 680 // to the median. | |
| 681 void PreenXHeights(BLOCK_LIST *block_list); | |
| 682 | |
| 683 // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: | |
| 684 // All fuzzy spaces are removed, and all the words are maximally chopped. | |
| 685 PAGE_RES *SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list); | |
| 686 // Tests the chopper by exhaustively running chop_one_blob. | |
| 687 // The word_res will contain filled chopped_word, seam_array, denorm, | |
| 688 // box_word and best_state for the maximally chopped word. | |
| 689 void MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row, | |
| 690 WERD_RES *word_res); | |
| 691 // Gather consecutive blobs that match the given box into the best_state | |
| 692 // and corresponding correct_text. | |
| 693 // Fights over which box owns which blobs are settled by pre-chopping and | |
| 694 // applying the blobs to box or next_box with the least non-overlap. | |
| 695 // Returns false if the box was in error, which can only be caused by | |
| 696 // failing to find an appropriate blob for a box. | |
| 697 // This means that occasionally, blobs may be incorrectly segmented if the | |
| 698 // chopper fails to find a suitable chop point. | |
| 699 bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, | |
| 700 const TBOX *next_box, const char *correct_text); | |
| 701 // Consume all source blobs that strongly overlap the given box, | |
| 702 // putting them into a new word, with the correct_text label. | |
| 703 // Fights over which box owns which blobs are settled by | |
| 704 // applying the blobs to box or next_box with the least non-overlap. | |
| 705 // Returns false if the box was in error, which can only be caused by | |
| 706 // failing to find an overlapping blob for a box. | |
| 707 bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, | |
| 708 const char *correct_text); | |
| 709 // Resegments the words by running the classifier in an attempt to find the | |
| 710 // correct segmentation that produces the required string. | |
| 711 void ReSegmentByClassification(PAGE_RES *page_res); | |
| 712 // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. | |
| 713 // Returns false if an invalid UNICHAR_ID is encountered. | |
| 714 bool ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids); | |
| 715 // Resegments the word to achieve the target_text from the classifier. | |
| 716 // Returns false if the re-segmentation fails. | |
| 717 // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and | |
| 718 // applies a full search on the classifier results to find the best classified | |
| 719 // segmentation. As a compromise to obtain better recall, 1-1 ambiguity | |
| 720 // substitutions ARE used. | |
| 721 bool FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res); | |
| 722 // Recursive helper to find a match to the target_text (from text_index | |
| 723 // position) in the choices (from choices_pos position). | |
| 724 // Choices is an array of vectors of length choices_length, with each | |
| 725 // element representing a starting position in the word, and the | |
| 726 // vector holding classification results for a sequence of consecutive | |
| 727 // blobs, with index 0 being a single blob, index 1 being 2 blobs etc. | |
| 728 void SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos, | |
| 729 unsigned choices_length, const std::vector<UNICHAR_ID> &target_text, | |
| 730 unsigned text_index, float rating, std::vector<int> *segmentation, | |
| 731 float *best_rating, std::vector<int> *best_segmentation); | |
| 732 // Counts up the labelled words and the blobs within. | |
| 733 // Deletes all unused or emptied words, counting the unused ones. | |
| 734 // Resets W_BOL and W_EOL flags correctly. | |
| 735 // Builds the rebuild_word and rebuilds the box_word. | |
| 736 void TidyUp(PAGE_RES *page_res); | |
| 737 // Logs a bad box by line in the box file and box coords. | |
| 738 void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg); | |
| 739 // Creates a fake best_choice entry in each WERD_RES with the correct text. | |
| 740 void CorrectClassifyWords(PAGE_RES *page_res); | |
| 741 // Call LearnWord to extract features for labelled blobs within each word. | |
| 742 // Features are stored in an internal buffer. | |
| 743 void ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res); | |
| 744 | |
| 745 //// fixxht.cpp /////////////////////////////////////////////////////// | |
| 746 // Returns the number of misfit blob tops in this word. | |
| 747 int CountMisfitTops(WERD_RES *word_res); | |
| 748 // Returns a new x-height in pixels (original image coords) that is | |
| 749 // maximally compatible with the result in word_res. | |
| 750 // Returns 0.0f if no x-height is found that is better than the current | |
| 751 // estimate. | |
| 752 float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift); | |
| 753 //// Data members /////////////////////////////////////////////////////// | |
| 754 // TODO(ocr-team): Find and remove obsolete parameters. | |
| 755 BOOL_VAR_H(tessedit_resegment_from_boxes); | |
| 756 BOOL_VAR_H(tessedit_resegment_from_line_boxes); | |
| 757 BOOL_VAR_H(tessedit_train_from_boxes); | |
| 758 BOOL_VAR_H(tessedit_make_boxes_from_boxes); | |
| 759 BOOL_VAR_H(tessedit_train_line_recognizer); | |
| 760 BOOL_VAR_H(tessedit_dump_pageseg_images); | |
| 761 // TODO: remove deprecated tessedit_do_invert in release 6. | |
| 762 BOOL_VAR_H(tessedit_do_invert); | |
| 763 double_VAR_H(invert_threshold); | |
| 764 INT_VAR_H(tessedit_pageseg_mode); | |
| 765 INT_VAR_H(thresholding_method); | |
| 766 BOOL_VAR_H(thresholding_debug); | |
| 767 double_VAR_H(thresholding_window_size); | |
| 768 double_VAR_H(thresholding_kfactor); | |
| 769 double_VAR_H(thresholding_tile_size); | |
| 770 double_VAR_H(thresholding_smooth_kernel_size); | |
| 771 double_VAR_H(thresholding_score_fraction); | |
| 772 INT_VAR_H(tessedit_ocr_engine_mode); | |
| 773 STRING_VAR_H(tessedit_char_blacklist); | |
| 774 STRING_VAR_H(tessedit_char_whitelist); | |
| 775 STRING_VAR_H(tessedit_char_unblacklist); | |
| 776 BOOL_VAR_H(tessedit_ambigs_training); | |
| 777 INT_VAR_H(pageseg_devanagari_split_strategy); | |
| 778 INT_VAR_H(ocr_devanagari_split_strategy); | |
| 779 STRING_VAR_H(tessedit_write_params_to_file); | |
| 780 BOOL_VAR_H(tessedit_adaption_debug); | |
| 781 INT_VAR_H(bidi_debug); | |
| 782 INT_VAR_H(applybox_debug); | |
| 783 INT_VAR_H(applybox_page); | |
| 784 STRING_VAR_H(applybox_exposure_pattern); | |
| 785 BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode); | |
| 786 BOOL_VAR_H(applybox_learn_ngrams_mode); | |
| 787 BOOL_VAR_H(tessedit_display_outwords); | |
| 788 BOOL_VAR_H(tessedit_dump_choices); | |
| 789 BOOL_VAR_H(tessedit_timing_debug); | |
| 790 BOOL_VAR_H(tessedit_fix_fuzzy_spaces); | |
| 791 BOOL_VAR_H(tessedit_unrej_any_wd); | |
| 792 BOOL_VAR_H(tessedit_fix_hyphens); | |
| 793 BOOL_VAR_H(tessedit_enable_doc_dict); | |
| 794 BOOL_VAR_H(tessedit_debug_fonts); | |
| 795 INT_VAR_H(tessedit_font_id); | |
| 796 BOOL_VAR_H(tessedit_debug_block_rejection); | |
| 797 BOOL_VAR_H(tessedit_enable_bigram_correction); | |
| 798 BOOL_VAR_H(tessedit_enable_dict_correction); | |
| 799 INT_VAR_H(tessedit_bigram_debug); | |
| 800 BOOL_VAR_H(enable_noise_removal); | |
| 801 INT_VAR_H(debug_noise_removal); | |
| 802 // Worst (min) certainty, for which a diacritic is allowed to make the base | |
| 803 // character worse and still be included. | |
| 804 double_VAR_H(noise_cert_basechar); | |
| 805 // Worst (min) certainty, for which a non-overlapping diacritic is allowed to | |
| 806 // make the base character worse and still be included. | |
| 807 double_VAR_H(noise_cert_disjoint); | |
| 808 // Worst (min) certainty, for which a diacritic is allowed to make a new | |
| 809 // stand-alone blob. | |
| 810 double_VAR_H(noise_cert_punc); | |
| 811 // Factor of certainty margin for adding diacritics to not count as worse. | |
| 812 double_VAR_H(noise_cert_factor); | |
| 813 INT_VAR_H(noise_maxperblob); | |
| 814 INT_VAR_H(noise_maxperword); | |
| 815 INT_VAR_H(debug_x_ht_level); | |
| 816 STRING_VAR_H(chs_leading_punct); | |
| 817 STRING_VAR_H(chs_trailing_punct1); | |
| 818 STRING_VAR_H(chs_trailing_punct2); | |
| 819 double_VAR_H(quality_rej_pc); | |
| 820 double_VAR_H(quality_blob_pc); | |
| 821 double_VAR_H(quality_outline_pc); | |
| 822 double_VAR_H(quality_char_pc); | |
| 823 INT_VAR_H(quality_min_initial_alphas_reqd); | |
| 824 INT_VAR_H(tessedit_tess_adaption_mode); | |
| 825 BOOL_VAR_H(tessedit_minimal_rej_pass1); | |
| 826 BOOL_VAR_H(tessedit_test_adaption); | |
| 827 BOOL_VAR_H(test_pt); | |
| 828 double_VAR_H(test_pt_x); | |
| 829 double_VAR_H(test_pt_y); | |
| 830 INT_VAR_H(multilang_debug_level); | |
| 831 INT_VAR_H(paragraph_debug_level); | |
| 832 BOOL_VAR_H(paragraph_text_based); | |
| 833 BOOL_VAR_H(lstm_use_matrix); | |
| 834 STRING_VAR_H(outlines_odd); | |
| 835 STRING_VAR_H(outlines_2); | |
| 836 BOOL_VAR_H(tessedit_good_quality_unrej); | |
| 837 BOOL_VAR_H(tessedit_use_reject_spaces); | |
| 838 double_VAR_H(tessedit_reject_doc_percent); | |
| 839 double_VAR_H(tessedit_reject_block_percent); | |
| 840 double_VAR_H(tessedit_reject_row_percent); | |
| 841 double_VAR_H(tessedit_whole_wd_rej_row_percent); | |
| 842 BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds); | |
| 843 BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds); | |
| 844 BOOL_VAR_H(tessedit_dont_blkrej_good_wds); | |
| 845 BOOL_VAR_H(tessedit_dont_rowrej_good_wds); | |
| 846 INT_VAR_H(tessedit_preserve_min_wd_len); | |
| 847 BOOL_VAR_H(tessedit_row_rej_good_docs); | |
| 848 double_VAR_H(tessedit_good_doc_still_rowrej_wd); | |
| 849 BOOL_VAR_H(tessedit_reject_bad_qual_wds); | |
| 850 BOOL_VAR_H(tessedit_debug_doc_rejection); | |
| 851 BOOL_VAR_H(tessedit_debug_quality_metrics); | |
| 852 BOOL_VAR_H(bland_unrej); | |
| 853 double_VAR_H(quality_rowrej_pc); | |
| 854 BOOL_VAR_H(unlv_tilde_crunching); | |
| 855 BOOL_VAR_H(hocr_font_info); | |
| 856 BOOL_VAR_H(hocr_char_boxes); | |
| 857 BOOL_VAR_H(crunch_early_merge_tess_fails); | |
| 858 BOOL_VAR_H(crunch_early_convert_bad_unlv_chs); | |
| 859 double_VAR_H(crunch_terrible_rating); | |
| 860 BOOL_VAR_H(crunch_terrible_garbage); | |
| 861 double_VAR_H(crunch_poor_garbage_cert); | |
| 862 double_VAR_H(crunch_poor_garbage_rate); | |
| 863 double_VAR_H(crunch_pot_poor_rate); | |
| 864 double_VAR_H(crunch_pot_poor_cert); | |
| 865 double_VAR_H(crunch_del_rating); | |
| 866 double_VAR_H(crunch_del_cert); | |
| 867 double_VAR_H(crunch_del_min_ht); | |
| 868 double_VAR_H(crunch_del_max_ht); | |
| 869 double_VAR_H(crunch_del_min_width); | |
| 870 double_VAR_H(crunch_del_high_word); | |
| 871 double_VAR_H(crunch_del_low_word); | |
| 872 double_VAR_H(crunch_small_outlines_size); | |
| 873 INT_VAR_H(crunch_rating_max); | |
| 874 INT_VAR_H(crunch_pot_indicators); | |
| 875 BOOL_VAR_H(crunch_leave_ok_strings); | |
| 876 BOOL_VAR_H(crunch_accept_ok); | |
| 877 BOOL_VAR_H(crunch_leave_accept_strings); | |
| 878 BOOL_VAR_H(crunch_include_numerals); | |
| 879 INT_VAR_H(crunch_leave_lc_strings); | |
| 880 INT_VAR_H(crunch_leave_uc_strings); | |
| 881 INT_VAR_H(crunch_long_repetitions); | |
| 882 INT_VAR_H(crunch_debug); | |
| 883 INT_VAR_H(fixsp_non_noise_limit); | |
| 884 double_VAR_H(fixsp_small_outlines_size); | |
| 885 BOOL_VAR_H(tessedit_prefer_joined_punct); | |
| 886 INT_VAR_H(fixsp_done_mode); | |
| 887 INT_VAR_H(debug_fix_space_level); | |
| 888 STRING_VAR_H(numeric_punctuation); | |
| 889 INT_VAR_H(x_ht_acceptance_tolerance); | |
| 890 INT_VAR_H(x_ht_min_change); | |
| 891 INT_VAR_H(superscript_debug); | |
| 892 double_VAR_H(superscript_worse_certainty); | |
| 893 double_VAR_H(superscript_bettered_certainty); | |
| 894 double_VAR_H(superscript_scaledown_ratio); | |
| 895 double_VAR_H(subscript_max_y_top); | |
| 896 double_VAR_H(superscript_min_y_bottom); | |
| 897 BOOL_VAR_H(tessedit_write_block_separators); | |
| 898 BOOL_VAR_H(tessedit_write_rep_codes); | |
| 899 BOOL_VAR_H(tessedit_write_unlv); | |
| 900 BOOL_VAR_H(tessedit_create_txt); | |
| 901 BOOL_VAR_H(tessedit_create_hocr); | |
| 902 BOOL_VAR_H(tessedit_create_alto); | |
| 903 BOOL_VAR_H(tessedit_create_page_xml); | |
| 904 BOOL_VAR_H(page_xml_polygon); | |
| 905 INT_VAR_H(page_xml_level); | |
| 906 BOOL_VAR_H(tessedit_create_lstmbox); | |
| 907 BOOL_VAR_H(tessedit_create_tsv); | |
| 908 BOOL_VAR_H(tessedit_create_wordstrbox); | |
| 909 BOOL_VAR_H(tessedit_create_pdf); | |
| 910 BOOL_VAR_H(textonly_pdf); | |
| 911 INT_VAR_H(jpg_quality); | |
| 912 INT_VAR_H(user_defined_dpi); | |
| 913 INT_VAR_H(min_characters_to_try); | |
| 914 STRING_VAR_H(unrecognised_char); | |
| 915 INT_VAR_H(suspect_level); | |
| 916 INT_VAR_H(suspect_short_words); | |
| 917 BOOL_VAR_H(suspect_constrain_1Il); | |
| 918 double_VAR_H(suspect_rating_per_ch); | |
| 919 double_VAR_H(suspect_accept_rating); | |
| 920 BOOL_VAR_H(tessedit_minimal_rejection); | |
| 921 BOOL_VAR_H(tessedit_zero_rejection); | |
| 922 BOOL_VAR_H(tessedit_word_for_word); | |
| 923 BOOL_VAR_H(tessedit_zero_kelvin_rejection); | |
| 924 INT_VAR_H(tessedit_reject_mode); | |
| 925 BOOL_VAR_H(tessedit_rejection_debug); | |
| 926 BOOL_VAR_H(tessedit_flip_0O); | |
| 927 double_VAR_H(tessedit_lower_flip_hyphen); | |
| 928 double_VAR_H(tessedit_upper_flip_hyphen); | |
| 929 BOOL_VAR_H(rej_trust_doc_dawg); | |
| 930 BOOL_VAR_H(rej_1Il_use_dict_word); | |
| 931 BOOL_VAR_H(rej_1Il_trust_permuter_type); | |
| 932 BOOL_VAR_H(rej_use_tess_accepted); | |
| 933 BOOL_VAR_H(rej_use_tess_blanks); | |
| 934 BOOL_VAR_H(rej_use_good_perm); | |
| 935 BOOL_VAR_H(rej_use_sensible_wd); | |
| 936 BOOL_VAR_H(rej_alphas_in_number_perm); | |
| 937 double_VAR_H(rej_whole_of_mostly_reject_word_fract); | |
| 938 INT_VAR_H(tessedit_image_border); | |
| 939 STRING_VAR_H(ok_repeated_ch_non_alphanum_wds); | |
| 940 STRING_VAR_H(conflict_set_I_l_1); | |
| 941 INT_VAR_H(min_sane_x_ht_pixels); | |
| 942 BOOL_VAR_H(tessedit_create_boxfile); | |
| 943 INT_VAR_H(tessedit_page_number); | |
| 944 BOOL_VAR_H(tessedit_write_images); | |
| 945 BOOL_VAR_H(interactive_display_mode); | |
| 946 STRING_VAR_H(file_type); | |
| 947 BOOL_VAR_H(tessedit_override_permuter); | |
| 948 STRING_VAR_H(tessedit_load_sublangs); | |
| 949 BOOL_VAR_H(tessedit_use_primary_params_model); | |
| 950 // Min acceptable orientation margin (difference in scores between top and 2nd | |
| 951 // choice in OSResults::orientations) to believe the page orientation. | |
| 952 double_VAR_H(min_orientation_margin); | |
| 953 BOOL_VAR_H(textord_tabfind_show_vlines); | |
| 954 BOOL_VAR_H(textord_use_cjk_fp_model); | |
| 955 BOOL_VAR_H(poly_allow_detailed_fx); | |
| 956 BOOL_VAR_H(tessedit_init_config_only); | |
| 957 #ifndef DISABLED_LEGACY_ENGINE | |
| 958 BOOL_VAR_H(textord_equation_detect); | |
| 959 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 960 BOOL_VAR_H(textord_tabfind_vertical_text); | |
| 961 BOOL_VAR_H(textord_tabfind_force_vertical_text); | |
| 962 double_VAR_H(textord_tabfind_vertical_text_ratio); | |
| 963 double_VAR_H(textord_tabfind_aligned_gap_fraction); | |
| 964 INT_VAR_H(tessedit_parallelize); | |
| 965 BOOL_VAR_H(preserve_interword_spaces); | |
| 966 STRING_VAR_H(page_separator); | |
| 967 INT_VAR_H(lstm_choice_mode); | |
| 968 INT_VAR_H(lstm_choice_iterations); | |
| 969 double_VAR_H(lstm_rating_coefficient); | |
| 970 BOOL_VAR_H(pageseg_apply_music_mask); | |
| 971 | |
| 972 //// ambigsrecog.cpp ///////////////////////////////////////////////////////// | |
| 973 FILE *init_recog_training(const char *filename); | |
| 974 void recog_training_segmented(const char *filename, PAGE_RES *page_res, | |
| 975 volatile ETEXT_DESC *monitor, FILE *output_file); | |
| 976 void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file); | |
| 977 | |
| 978 private: | |
| 979 // The filename of a backup config file. If not null, then we currently | |
| 980 // have a temporary debug config file loaded, and backup_config_file_ | |
| 981 // will be loaded, and set to null when debug is complete. | |
| 982 const char *backup_config_file_; | |
| 983 // The filename of a config file to read when processing a debug word. | |
| 984 std::string word_config_; | |
| 985 // Image used for input to layout analysis and tesseract recognition. | |
| 986 // May be modified by the ShiroRekhaSplitter to eliminate the top-line. | |
| 987 Image pix_binary_; | |
| 988 // Grey-level input image if the input was not binary, otherwise nullptr. | |
| 989 Image pix_grey_; | |
| 990 // Original input image. Color if the input was color. | |
| 991 Image pix_original_; | |
| 992 // Thresholds that were used to generate the thresholded image from grey. | |
| 993 Image pix_thresholds_; | |
| 994 // Debug images. If non-empty, will be written on destruction. | |
| 995 DebugPixa pixa_debug_; | |
| 996 // Input image resolution after any scaling. The resolution is not well | |
| 997 // transmitted by operations on Pix, so we keep an independent record here. | |
| 998 int source_resolution_; | |
| 999 // The shiro-rekha splitter object which is used to split top-lines in | |
| 1000 // Devanagari words to provide a better word and grapheme segmentation. | |
| 1001 ShiroRekhaSplitter splitter_; | |
| 1002 // Page segmentation/layout | |
| 1003 Textord textord_; | |
| 1004 // True if the primary language uses right_to_left reading order. | |
| 1005 bool right_to_left_; | |
| 1006 Image scaled_color_; | |
| 1007 int scaled_factor_; | |
| 1008 FCOORD deskew_; | |
| 1009 FCOORD reskew_; | |
| 1010 float gradient_; | |
| 1011 TesseractStats stats_; | |
| 1012 // Sub-languages to be tried in addition to this. | |
| 1013 std::vector<Tesseract *> sub_langs_; | |
| 1014 // Most recently used Tesseract out of this and sub_langs_. The default | |
| 1015 // language for the next word. | |
| 1016 Tesseract *most_recently_used_; | |
| 1017 // The size of the font table, i.e. max possible font id + 1. | |
| 1018 int font_table_size_; | |
| 1019 #ifndef DISABLED_LEGACY_ENGINE | |
| 1020 // Equation detector. Note: this pointer is NOT owned by the class. | |
| 1021 EquationDetect *equ_detect_; | |
| 1022 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1023 // LSTM recognizer, if available. | |
| 1024 LSTMRecognizer *lstm_recognizer_; | |
| 1025 // Output "page" number (actually line number) using TrainLineRecognizer. | |
| 1026 int train_line_page_num_; | |
| 1027 }; | |
| 1028 | |
| 1029 } // namespace tesseract | |
| 1030 | |
| 1031 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_ |
