Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/pageres.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: pageres.h (Formerly page_res.h) | |
| 3 * Description: Results classes used by control.c | |
| 4 * Author: Phil Cheatle | |
| 5 * | |
| 6 * (C) Copyright 1992, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #ifndef PAGERES_H | |
| 20 #define PAGERES_H | |
| 21 | |
| 22 #include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS | |
| 23 #include "clst.h" // for CLIST_ITERATOR, CLISTIZEH | |
| 24 #include "elst.h" // for ELIST_ITERATOR, ELIST_LINK, ELISTIZEH | |
| 25 #include "genericvector.h" // for PointerVector | |
| 26 #include "matrix.h" // for MATRIX | |
| 27 #include "normalis.h" // for DENORM | |
| 28 #include "ratngs.h" // for WERD_CHOICE, BLOB_CHOICE (ptr only) | |
| 29 #include "rect.h" // for TBOX | |
| 30 #include "rejctmap.h" // for REJMAP | |
| 31 #include "unicharset.h" // for UNICHARSET, UNICHARSET::Direction, UNI... | |
| 32 #include "werd.h" // for WERD, W_BOL, W_EOL | |
| 33 | |
| 34 #include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID | |
| 35 | |
| 36 #include <cstdint> // for int32_t, int16_t | |
| 37 #include <functional> // for std::function | |
| 38 #include <set> // for std::pair | |
| 39 #include <vector> // for std::vector | |
| 40 | |
| 41 #include <sys/types.h> // for int8_t | |
| 42 | |
| 43 struct Pix; | |
| 44 | |
| 45 namespace tesseract { | |
| 46 | |
| 47 class BLOCK; | |
| 48 class BLOCK_LIST; | |
| 49 class BLOCK_RES; | |
| 50 class ROW; | |
| 51 class ROW_RES; | |
| 52 class SEAM; | |
| 53 class WERD_RES; | |
| 54 | |
| 55 struct TWERD; | |
| 56 | |
| 57 class BoxWord; | |
| 58 class Tesseract; | |
| 59 struct FontInfo; | |
| 60 | |
| 61 /* Forward declarations */ | |
| 62 | |
| 63 class BLOCK_RES; | |
| 64 | |
| 65 ELISTIZEH(BLOCK_RES) | |
| 66 CLISTIZEH(BLOCK_RES) | |
| 67 class ROW_RES; | |
| 68 | |
| 69 ELISTIZEH(ROW_RES) | |
| 70 class WERD_RES; | |
| 71 | |
| 72 ELISTIZEH(WERD_RES) | |
| 73 | |
| 74 /************************************************************************* | |
| 75 * PAGE_RES - Page results | |
| 76 *************************************************************************/ | |
| 77 class PAGE_RES { // Page-level results: running counters plus the list of per-block results. | |
| 78 public: | |
| 79 int32_t char_count; | |
| 80 int32_t rej_count; | |
| 81 BLOCK_RES_LIST block_res_list; | |
| 82 bool rejected; | |
| 83 // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to | |
| 84 // the next word. This pointer is not owned by the PAGE_RES class. | |
| 85 WERD_CHOICE **prev_word_best_choice; | |
| 86 // Sums of blame reasons computed by the blamer; Init() sizes this to IRR_NUM_REASONS entries. | |
| 87 std::vector<int> blame_reasons; | |
| 88 // Debug information about all the misadaptions on this page. | |
| 89 // Each BlamerBundle contains an index into this vector, so that words that | |
| 90 // caused misadaption could be marked. However, since words could be | |
| 91 // deleted/split/merged, the log is stored on the PAGE_RES level. | |
| 92 std::vector<std::string> misadaption_log; | |
| 93 /* Resets the counters, the rejected flag and the borrowed pointer, and leaves blame_reasons holding IRR_NUM_REASONS zeros. block_res_list and misadaption_log are not touched. */ | |
| 94 inline void Init() { | |
| 95 char_count = 0; | |
| 96 rej_count = 0; | |
| 97 rejected = false; | |
| 98 prev_word_best_choice = nullptr; | |
| 99 blame_reasons.clear(); /* clear first so that resize() below value-initializes every slot */ | |
| 100 blame_reasons.resize(IRR_NUM_REASONS); | |
| 101 } | |
| 102 | |
| 103 PAGE_RES() { | |
| 104 Init(); | |
| 105 } // Default constructor: all scalar state comes from Init(). | |
| 106 /* Declared here and defined out of line. NOTE(review): prev_word_best_choice_ptr presumably seeds prev_word_best_choice - confirm in the definition. */ | |
| 107 PAGE_RES(bool merge_similar_words, | |
| 108 BLOCK_LIST *block_list, // real blocks | |
| 109 WERD_CHOICE **prev_word_best_choice_ptr); | |
| 110 | |
| 111 ~PAGE_RES() = default; | |
| 112 }; | |
| 113 | |
| 114 /************************************************************************* | |
| 115 * BLOCK_RES - Block results | |
| 116 *************************************************************************/ | |
| 117 | |
| 118 class BLOCK_RES : public ELIST_LINK { // Per-block results; list-linkable so they can be held in a BLOCK_RES_LIST. | |
| 119 public: | |
| 120 BLOCK *block = nullptr; // real block | |
| 121 int32_t char_count = 0; // chars in block | |
| 122 int32_t rej_count = 0; // rejected chars | |
| 123 int16_t font_class = 0; // NOTE(review): meaning is not documented in this header | |
| 124 int16_t row_count = 0; | |
| 125 float x_height = 0.0f; | |
| 126 bool font_assigned = false; // block already | |
| 127 // processed | |
| 128 | |
| 129 ROW_RES_LIST row_res_list; | |
| 130 | |
| 131 BLOCK_RES() = default; // Members start from the in-class defaults above instead of indeterminate values. | |
| 132 // NOTE(review): the constructor below is defined out of line; confirm the zero defaults above agree with whatever "unset" values it assigns. | |
| 133 BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block | |
| 134 | |
| 135 ~BLOCK_RES() = default; | |
| 136 }; | |
| 137 | |
| 138 /************************************************************************* | |
| 139 * ROW_RES - Row results | |
| 140 *************************************************************************/ | |
| 141 | |
| 142 class ROW_RES : public ELIST_LINK { // Per-row results; list-linkable so they can be held in a ROW_RES_LIST. | |
| 143 public: | |
| 144 ROW *row = nullptr; // real row | |
| 145 int32_t char_count = 0; // chars in row | |
| 146 int32_t rej_count = 0; // rejected chars | |
| 147 int32_t whole_word_rej_count = 0; // rejs in total rej wds | |
| 148 WERD_RES_LIST word_res_list; | |
| 149 | |
| 150 ROW_RES() = default; // Members start from the in-class defaults above instead of indeterminate values. | |
| 151 // NOTE(review): the constructor below is defined out of line; confirm the zero defaults above agree with what it assigns. | |
| 152 ROW_RES(bool merge_similar_words, ROW *the_row); // real row | |
| 153 | |
| 154 ~ROW_RES() = default; | |
| 155 }; | |
| 156 | |
| 157 /************************************************************************* | |
| 158 * WERD_RES - Word results | |
| 159 *************************************************************************/ | |
| 160 enum CRUNCH_MODE { CR_NONE, CR_KEEP_SPACE, CR_LOOSE_SPACE, CR_DELETE }; /* Type of WERD_RES::unlv_crunch_mode, which defaults to CR_NONE. */ | |
| 161 | |
| 162 // WERD_RES is a collection of publicly accessible members that gathers | |
| 163 // information about a word result. | |
| 164 class TESS_API WERD_RES : public ELIST_LINK { | |
| 165 public: | |
| 166 // Which word is which? | |
| 167 // There are 3 coordinate spaces in use here: a possibly rotated pixel space, | |
| 168 // the original image coordinate space, and the BLN space in which the | |
| 169 // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight, | |
| 170 // and the x-middle of the word is at 0. | |
| 171 // In the rotated pixel space, coordinates correspond to the input image, | |
| 172 // but may be rotated about the origin by a multiple of 90 degrees, | |
| 173 // and may therefore be negative. | |
| 174 // In any case a rotation by denorm.block()->re_rotation() will take them | |
| 175 // back to the original image. | |
| 176 // The other differences between words all represent different stages of | |
| 177 // processing during recognition. | |
| 178 | |
| 179 // ---------------------------INPUT------------------------------------- | |
| 180 | |
| 181 // The word is the input C_BLOBs in the rotated pixel space. | |
| 182 // word is NOT owned by the WERD_RES unless combination is true. | |
| 183 // All the other word pointers ARE owned by the WERD_RES. | |
| 184 WERD *word = nullptr; // Input C_BLOB word. | |
| 185 | |
| 186 // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------ | |
| 187 | |
| 188 // The bln_boxes contains the bounding boxes (only) of the input word, in the | |
| 189 // BLN space. The lengths of word and bln_boxes | |
| 190 // match as they are both before any chopping. | |
| 191 // TODO(rays) determine if docqual does anything useful and delete bln_boxes | |
| 192 // if it doesn't. | |
| 193 tesseract::BoxWord *bln_boxes = nullptr; // BLN input bounding boxes. | |
| 194 // The ROW that this word sits in. NOT owned by the WERD_RES. | |
| 195 ROW *blob_row = nullptr; | |
| 196 // The denorm provides the transformation to get back to the rotated image | |
| 197 // coords from the chopped_word/rebuild_word BLN coords, but each blob also | |
| 198 // has its own denorm. | |
| 199 DENORM denorm; // For use on chopped_word. | |
| 200 // Unicharset used by the classifier output in best_choice and raw_choice. | |
| 201 const UNICHARSET *uch_set = nullptr; // For converting back to utf8. | |
| 202 | |
| 203 // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION---- | |
| 204 // ----Setup to a (different!) state expected by the various classifiers---- | |
| 205 // TODO(rays) Tidy and make more consistent. | |
| 206 | |
| 207 // The chopped_word is also in BLN space, and represents the fully chopped | |
| 208 // character fragments that make up the word. | |
| 209 // The length of chopped_word matches length of seam_array + 1 (if set). | |
| 210 TWERD *chopped_word = nullptr; // BLN chopped fragments output. | |
| 211 // Vector of SEAM* holding chopping points matching chopped_word. | |
| 212 std::vector<SEAM *> seam_array; | |
| 213 // Widths of blobs in chopped_word. | |
| 214 std::vector<int> blob_widths; | |
| 215 // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between | |
| 216 // blob i and blob i+1. | |
| 217 std::vector<int> blob_gaps; | |
| 218 // Stores the lstm choices of every timestep | |
| 219 std::vector<std::vector<std::pair<const char *, float>>> timesteps; | |
| 220 // Stores the lstm choices of every timestep segmented by character | |
| 221 std::vector<std::vector<std::vector<std::pair<const char *, float>>>> | |
| 222 segmented_timesteps; | |
| 223 // Symbolchoices acquired during CTC | |
| 224 std::vector<std::vector<std::pair<const char *, float>>> CTC_symbol_choices; | |
| 225 // Stores if the timestep vector starts with a space | |
| 226 bool leading_space = false; | |
| 227 // Stores value when the word ends | |
| 228 int end = 0; | |
| 229 // Ratings matrix contains classifier choices for each classified combination | |
| 230 // of blobs. The dimension is the same as the number of blobs in chopped_word | |
| 231 // and the leading diagonal corresponds to classifier results of the blobs | |
| 232 // in chopped_word. The state_ members of best_choice, raw_choice and | |
| 233 // best_choices all correspond to this ratings matrix and allow extraction | |
| 234 // of the blob choices for any given WERD_CHOICE. | |
| 235 MATRIX *ratings = nullptr; // Owned pointer. | |
| 236 // Pointer to the first WERD_CHOICE in best_choices. This is the result that | |
| 237 // will be output from Tesseract. Note that this is now a borrowed pointer | |
| 238 // and should NOT be deleted. | |
| 239 WERD_CHOICE *best_choice = nullptr; // Borrowed pointer. | |
| 240 // The best raw_choice found during segmentation search. Differs from the | |
| 241 // best_choice by being the best result according to just the character | |
| 242 // classifier, not taking any language model information into account. | |
| 243 // Unlike best_choice, the pointer IS owned by this WERD_RES. | |
| 244 WERD_CHOICE *raw_choice = nullptr; // Owned pointer. | |
| 245 // Alternative results found during chopping/segmentation search stages. | |
| 246 // Note that being an ELIST, best_choices owns the WERD_CHOICEs. | |
| 247 WERD_CHOICE_LIST best_choices; | |
| 248 | |
| 249 // Truth bounding boxes, text and incorrect choice reason. | |
| 250 BlamerBundle *blamer_bundle = nullptr; | |
| 251 | |
| 252 // --------------OUTPUT FROM RECOGNITION------------------------------- | |
| 253 // --------------Not all fields are necessarily set.------------------- | |
| 254 // ---best_choice, raw_choice *must* end up set, with a box_word------- | |
| 255 // ---In complete output, the number of blobs in rebuild_word matches--- | |
| 256 // ---the number of boxes in box_word, the number of unichar_ids in--- | |
| 257 // ---best_choice, the number of ints in best_state, and the number--- | |
| 258 // ---of strings in correct_text-------------------------------------- | |
| 259 // ---SetupFake Sets everything to appropriate values if the word is--- | |
| 260 // ---known to be bad before recognition.------------------------------ | |
| 261 | |
| 262 // The rebuild_word is also in BLN space, but represents the final best | |
| 263 // segmentation of the word. Its length is therefore the same as box_word. | |
| 264 TWERD *rebuild_word = nullptr; // BLN best segmented word. | |
| 265 // The box_word is in the original image coordinate space. It is the | |
| 266 // bounding boxes of the rebuild_word, after denormalization. | |
| 267 // The length of box_word matches rebuild_word, best_state (if set) and | |
| 268 // correct_text (if set), as well as best_choice and represents the | |
| 269 // number of classified units in the output. | |
| 270 tesseract::BoxWord *box_word = nullptr; // Denormalized output boxes. | |
| 271 // The Tesseract that was used to recognize this word. Just a borrowed | |
| 272 // pointer. Note: Tesseract's class definition is in a higher-level library. | |
| 273 // We avoid introducing a cyclic dependency by not using the Tesseract | |
| 274 // within WERD_RES. We are just storing it to provide access to it | |
| 275 // for the top-level multi-language controller, and maybe for output of | |
| 276 // the recognized language. | |
| 277 // tesseract points to data owned elsewhere. | |
| 278 tesseract::Tesseract *tesseract = nullptr; | |
| 279 // The best_state stores the relationship between chopped_word and | |
| 280 // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i] | |
| 281 // adjacent blobs in chopped_word. The seams in seam_array are hidden | |
| 282 // within a rebuild_word blob and revealed between them. | |
| 283 std::vector<int> best_state; // Number of blobs in each best blob. | |
| 284 // The correct_text is used during training and adaption to carry the | |
| 285 // text to the training system without the need for a unicharset. There | |
| 286 // is one entry in the vector for each blob in rebuild_word and box_word. | |
| 287 std::vector<std::string> correct_text; | |
| 288 | |
| 289 // Less-well documented members. | |
| 290 // TODO(rays) Add more documentation here. | |
| 291 WERD_CHOICE *ep_choice = nullptr; // ep text TODO(rays) delete this. | |
| 292 REJMAP reject_map; // best_choice rejects | |
| 293 bool tess_failed = false; | |
| 294 /* | |
| 295 If tess_failed is true, one of the following tests failed when Tess | |
| 296 returned: | |
| 297 - The outword blob list was not the same length as the best_choice string; | |
| 298 - The best_choice string contained ALL blanks; | |
| 299 - The best_choice string was zero length | |
| 300 */ | |
| 301 bool tess_accepted = false; // Tess thinks its ok? | |
| 302 bool tess_would_adapt = false; // Tess would adapt? | |
| 303 bool done = false; // ready for output? | |
| 304 bool small_caps = false; // word appears to be small caps | |
| 305 bool odd_size = false; // word is bigger than line or leader dots. | |
| 306 // The fontinfos are pointers to data owned by the classifier. | |
| 307 const FontInfo *fontinfo = nullptr; | |
| 308 const FontInfo *fontinfo2 = nullptr; | |
| 309 int8_t fontinfo_id_count = 0; // number of votes | |
| 310 int8_t fontinfo_id2_count = 0; // number of votes | |
| 311 bool guessed_x_ht = true; | |
| 312 bool guessed_caps_ht = true; | |
| 313 CRUNCH_MODE unlv_crunch_mode = CR_NONE; | |
| 314 float x_height = 0.0f; // post match estimate | |
| 315 float caps_height = 0.0f; // post match estimate | |
| 316 float baseline_shift = 0.0f; // post match estimate. | |
| 317 // Certainty score for the spaces either side of this word (LSTM mode). | |
| 318 // MIN this value with the actual word certainty. | |
| 319 float space_certainty = 0.0f; | |
| 320 | |
| 321 /* | |
| 322 To deal with fuzzy spaces we need to be able to combine "words" to form | |
| 323 combinations when we suspect that the gap is a non-space. The (new) textord | |
| 324 code generates separate words for EVERY fuzzy gap - flags in the word | |
| 325 indicate whether the gap is below the threshold (fuzzy kern) and is thus | |
| 326 NOT a real word break by default, or above the threshold (fuzzy space) and | |
| 327 this is a real word break by default. | |
| 328 | |
| 329 The WERD_RES list contains all these words PLUS "combination" words built | |
| 330 out of (copies of) the words split by fuzzy kerns. The separate parts have | |
| 331 their "part_of_combo" flag set true and should be IGNORED on a default | |
| 332 reading of the list. | |
| 333 | |
| 334 Combination words are FOLLOWED by the sequence of part_of_combo words | |
| 335 which they combine. | |
| 336 */ | |
| 337 bool combination = false; // of two fuzzy gap wds | |
| 338 bool part_of_combo = false; // part of a combo | |
| 339 bool reject_spaces = false; // Reject spacing? | |
| 340 | |
| 341 WERD_RES() = default; | |
| 342 | |
| 343 WERD_RES(WERD *the_word) // Wraps an input word; every other member keeps its in-class default. | |
| 344 : word(the_word) { | |
| 345 } | |
| 346 // Copy constructor. Delegates to operator=, which is documented as deep | |
| 347 // copying everything except the ratings MATRIX; use deep_copy to get that too. | |
| 348 WERD_RES(const WERD_RES &source) : ELIST_LINK(source) { | |
| 349 // combination must be set before assigning: Clear(), called from operator=, reads it. | |
| 350 combination = false; | |
| 351 *this = source; // all member copying happens in operator= | |
| 352 } | |
| 353 | |
| 354 ~WERD_RES(); | |
| 355 | |
| 356 // Looks up the UTF-8 text of the symbol at blob_index in best_choice. | |
| 357 // Characters are recognized by shape alone and reported as for a | |
| 358 // left-to-right context, so when in_rtl_context is set a mirrorable symbol | |
| 359 // (e.g. a parenthesis) is swapped for its mirror before the lookup. | |
| 360 // Returns nullptr if there is no best_choice or the index/id is out of range. | |
| 361 const char *BestUTF8(unsigned blob_index, bool in_rtl_context) const { | |
| 362 const bool has_symbol = best_choice != nullptr && blob_index < best_choice->length(); | |
| 363 if (!has_symbol) { | |
| 364 return nullptr; | |
| 365 } | |
| 366 const UNICHAR_ID shape_id = best_choice->unichar_id(blob_index); | |
| 367 if (static_cast<unsigned>(shape_id) >= uch_set->size()) { | |
| 368 return nullptr; // Not a valid id for this unicharset. | |
| 369 } | |
| 370 // The mirror is always looked up, but only used in a right-to-left context. | |
| 371 const UNICHAR_ID mirror_id = uch_set->get_mirror(shape_id); | |
| 372 const bool use_mirror = in_rtl_context && mirror_id > 0; | |
| 373 return uch_set->id_to_unichar_ext(use_mirror ? mirror_id : shape_id); | |
| 374 } | |
| 375 // Returns the UTF-8 string for the given blob index in the raw_choice word, or nullptr when raw_choice or uch_set is unset or the index/id is out of range. | |
| 376 const char *RawUTF8(unsigned blob_index) const { | |
| 377 if (raw_choice == nullptr || uch_set == nullptr || blob_index >= raw_choice->length()) { // both pointers default to nullptr | |
| 378 return nullptr; | |
| 379 } | |
| 380 UNICHAR_ID id = raw_choice->unichar_id(blob_index); | |
| 381 if (static_cast<unsigned>(id) >= uch_set->size()) { | |
| 382 return nullptr; // Not a valid id for this unicharset. | |
| 383 } | |
| 384 return uch_set->id_to_unichar(id); | |
| 385 } | |
| 386 | |
| 387 UNICHARSET::Direction SymbolDirection(unsigned blob_index) const { | |
| 388 // Direction of the best_choice symbol at blob_index; U_OTHER_NEUTRAL when there is no such symbol. | |
| 389 const bool has_symbol = best_choice != nullptr && blob_index < best_choice->length(); | |
| 390 return has_symbol ? uch_set->get_direction(best_choice->unichar_id(blob_index)) | |
| 391 : UNICHARSET::U_OTHER_NEUTRAL; | |
| 392 } | |
| 393 | |
| 394 bool AnyRtlCharsInWord() const { | |
| 395 // True when some valid best_choice symbol is right-to-left (plain or Arabic); false when there is nothing to scan. | |
| 396 if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1) { | |
| 397 return false; | |
| 398 } | |
| 399 for (unsigned pos = 0; pos < best_choice->length(); ++pos) { | |
| 400 const unsigned symbol = best_choice->unichar_id(pos); | |
| 401 if (symbol < uch_set->size()) { // Ids outside the unicharset are ignored. | |
| 402 switch (uch_set->get_direction(symbol)) { | |
| 403 case UNICHARSET::U_RIGHT_TO_LEFT: | |
| 404 case UNICHARSET::U_RIGHT_TO_LEFT_ARABIC: | |
| 405 return true; | |
| 406 default: break; | |
| 407 } | |
| 408 } | |
| 409 } | |
| 410 return false; | |
| 411 } | |
| 412 | |
| 413 bool AnyLtrCharsInWord() const { | |
| 414 // True when some valid best_choice symbol is left-to-right or an Arabic number; false when there is nothing to scan. | |
| 415 if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1) { | |
| 416 return false; | |
| 417 } | |
| 418 for (unsigned pos = 0; pos < best_choice->length(); ++pos) { | |
| 419 const unsigned symbol = best_choice->unichar_id(pos); | |
| 420 if (symbol < uch_set->size()) { // Ids outside the unicharset are ignored. | |
| 421 switch (uch_set->get_direction(symbol)) { | |
| 422 case UNICHARSET::U_LEFT_TO_RIGHT: | |
| 423 case UNICHARSET::U_ARABIC_NUMBER: | |
| 424 return true; | |
| 425 default: break; | |
| 426 } | |
| 427 } | |
| 428 } | |
| 429 return false; | |
| 430 } | |
| 431 | |
| 432 // Return whether the blobs in this WERD_RES 0, 1,... come from an engine | |
| 433 // that gave us the unichars in reading order (as opposed to strict left | |
| 434 // to right). Forwards to best_choice->unichars_in_script_order(); best_choice is dereferenced unchecked, so it must be non-null. | |
| 435 bool UnicharsInReadingOrder() const { | |
| 436 return best_choice->unichars_in_script_order(); | |
| 437 } | |
| 438 | |
| 439 void Clear(); | |
| 440 void ClearResults(); | |
| 441 void ClearWordChoices(); | |
| 442 void ClearRatings(); | |
| 443 | |
| 444 // Deep copies everything except the ratings MATRIX. | |
| 445 // To get that use deep_copy below. | |
| 446 WERD_RES &operator=(const WERD_RES &source); // from this | |
| 447 | |
| 448 void CopySimpleFields(const WERD_RES &source); | |
| 449 | |
| 450 // Initializes a blank (default constructed) WERD_RES from one that has | |
| 451 // already been recognized. | |
| 452 // Use SetupFor*Recognition afterwards to complete the setup and make | |
| 453 // it ready for a retry recognition. | |
| 454 void InitForRetryRecognition(const WERD_RES &source); | |
| 455 | |
| 456 // Sets up the members used in recognition: bln_boxes, chopped_word, | |
| 457 // seam_array, denorm. Returns false if | |
| 458 // the word is empty and sets up fake results. If use_body_size is | |
| 459 // true and row->body_size is set, then body_size will be used for | |
| 460 // blob normalization instead of xheight + ascrise. This flag is for | |
| 461 // those languages that are using CJK pitch model and thus it has to | |
| 462 // be true if and only if tesseract->textord_use_cjk_fp_model is | |
| 463 // true. | |
| 464 // If allow_detailed_fx is true, the feature extractor will receive fine | |
| 465 // precision outline information, allowing smoother features and better | |
| 466 // features on low resolution images. | |
| 467 // The norm_mode sets the default mode for normalization in absence | |
| 468 // of any of the above flags. It should really be a tesseract::OcrEngineMode | |
| 469 // but is declared as int for ease of use with tessedit_ocr_engine_mode. | |
| 470 // Returns false if the word is empty and sets up fake results. | |
| 471 bool SetupForRecognition(const UNICHARSET &unicharset_in, | |
| 472 tesseract::Tesseract *tesseract, Image pix, | |
| 473 int norm_mode, const TBOX *norm_box, | |
| 474 bool numeric_mode, bool use_body_size, | |
| 475 bool allow_detailed_fx, ROW *row, | |
| 476 const BLOCK *block); | |
| 477 | |
| 478 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty | |
| 479 // accumulators from a made chopped word. We presume the fields are already | |
| 480 // empty. | |
| 481 void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in); | |
| 482 | |
| 483 // Sets up the members used in recognition for an empty recognition result: | |
| 484 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. | |
| 485 void SetupFake(const UNICHARSET &uch); | |
| 486 | |
| 487 // Set the word as having the script of the input unicharset. | |
| 488 void SetupWordScript(const UNICHARSET &unicharset_in); | |
| 489 | |
| 490 // Sets up the blamer_bundle if it is not null, using the initialized denorm. | |
| 491 void SetupBlamerBundle(); | |
| 492 | |
| 493 // Computes the blob_widths and blob_gaps from the chopped_word. | |
| 494 void SetupBlobWidthsAndGaps(); | |
| 495 | |
| 496 // Updates internal data to account for a new SEAM (chop) at the given | |
| 497 // blob_number. Fixes the ratings matrix and states in the choices, as well | |
| 498 // as the blob widths and gaps. | |
| 499 void InsertSeam(int blob_number, SEAM *seam); | |
| 500 | |
| 501 // Returns true if all the word choices except the first have adjust_factors | |
| 502 // worse than the given threshold. | |
| 503 bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const; | |
| 504 | |
| 505 // Returns true if the current word is ambiguous (by number of answers or | |
| 506 // by dangerous ambigs.) | |
| 507 bool IsAmbiguous(); | |
| 508 | |
| 509 // Returns true if the ratings matrix size matches the sum of each of the | |
| 510 // segmentation states. | |
| 511 bool StatesAllValid(); | |
| 512 | |
| 513 // Prints a list of words found if debug is true or the word result matches | |
| 514 // the word_to_debug. | |
| 515 void DebugWordChoices(bool debug, const char *word_to_debug); | |
| 516 | |
| 517 // Prints the top choice along with the accepted/done flags. | |
| 518 void DebugTopChoice(const char *msg) const; | |
| 519 | |
| 520 // Removes from best_choices all choices which are not within a reasonable | |
| 521 // range of the best choice. | |
| 522 void FilterWordChoices(int debug_level); | |
| 523 | |
| 524 // Computes a set of distance thresholds used to control adaption. | |
| 525 // Compares the best choice for the current word to the best raw choice | |
| 526 // to determine which characters were classified incorrectly by the | |
| 527 // classifier. Then places a separate threshold into thresholds for each | |
| 528 // character in the word. If the classifier was correct, max_rating is placed | |
| 529 // into thresholds. If the classifier was incorrect, the mean match rating | |
| 530 // (error percentage) of the classifier's incorrect choice minus some margin | |
| 531 // is placed into thresholds. This can then be used by the caller to try to | |
| 532 // create a new template for the desired class that will classify the | |
| 533 // character with a rating better than the threshold value. The match rating | |
| 534 // placed into thresholds is never allowed to be below min_rating in order to | |
| 535 // prevent trying to make overly tight templates. | |
| 536 // min_rating limits how tight to make a template. | |
| 537 // max_rating limits how loose to make a template. | |
| 538 // rating_margin denotes the amount of margin to put in template. | |
| 539 void ComputeAdaptionThresholds(float certainty_scale, float min_rating, | |
| 540 float max_rating, float rating_margin, | |
| 541 float *thresholds); | |
| 542 | |
| 543 // Saves a copy of the word_choice if it has the best unadjusted rating. | |
| 544 // Returns true if the word_choice was the new best. | |
| 545 bool LogNewRawChoice(WERD_CHOICE *word_choice); | |
| 546 // Consumes word_choice by adding it to best_choices, (taking ownership) if | |
| 547 // the certainty for word_choice is some distance of the best choice in | |
| 548 // best_choices, or by deleting the word_choice and returning false. | |
| 549 // The best_choices list is kept in sorted order by rating. Duplicates are | |
| 550 // removed, and the list is kept no longer than max_num_choices in length. | |
| 551 // Returns true if the word_choice is still a valid pointer. | |
| 552 bool LogNewCookedChoice(int max_num_choices, bool debug, | |
| 553 WERD_CHOICE *word_choice); | |
| 554 | |
| 555 // Prints a brief list of all the best choices. | |
| 556 void PrintBestChoices() const; | |
| 557 | |
| 558 // Returns the sum of the widths of the blob between start_blob and last_blob | |
| 559 // inclusive. | |
| 560 int GetBlobsWidth(int start_blob, int last_blob) const; | |
| 561 // Returns the width of a gap between the specified blob and the next one. | |
| 562 int GetBlobsGap(unsigned blob_index) const; | |
| 563 | |
| 564 // Returns the BLOB_CHOICE corresponding to the given index in the | |
| 565 // best choice word taken from the appropriate cell in the ratings MATRIX. | |
| 566 // Borrowed pointer, so do not delete. May return nullptr if there is no | |
| 567 // BLOB_CHOICE matching the unichar_id at the given index. | |
| 568 BLOB_CHOICE *GetBlobChoice(unsigned index) const; | |
| 569 | |
| 570 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the | |
| 571 // best choice word taken from the appropriate cell in the ratings MATRIX. | |
| 572 // Borrowed pointer, so do not delete. | |
| 573 BLOB_CHOICE_LIST *GetBlobChoices(int index) const; | |
| 574 | |
| 575 // Moves the results fields from word to this. This takes ownership of all | |
| 576 // the data, so word can be destructed. | |
| 577 // word1.ConsumeWordResults(word); | |
| 578 // delete word; | |
| 579 // is simpler and faster than: | |
| 580 // word1 = *word; | |
| 581 // delete word; | |
| 582 // as it doesn't need to copy and reallocate anything. | |
| 583 void ConsumeWordResults(WERD_RES *word); | |
| 584 | |
| 585 // Replace the best choice and rebuild box word. | |
| 586 // choice must be from the current best_choices list. | |
| 587 void ReplaceBestChoice(WERD_CHOICE *choice); | |
| 588 | |
| 589 // Builds the rebuild_word and sets the best_state from the chopped_word and | |
| 590 // the best_choice->state. | |
| 591 void RebuildBestState(); | |
| 592 | |
| 593 // Copies the chopped_word to the rebuild_word, faking a best_state as well. | |
| 594 // Also sets up the output box_word. | |
| 595 void CloneChoppedToRebuild(); | |
| 596 | |
| 597 // Sets/replaces the box_word with one made from the rebuild_word. | |
| 598 void SetupBoxWord(); | |
| 599 | |
| 600 // Sets up the script positions in the best_choice using the best_choice | |
| 601 // to get the unichars, and the unicharset to get the target positions. | |
| 602 void SetScriptPositions(); | |
| 603 // Sets all the blobs in all the words (best choice and alternates) to be | |
| 604 // the given position. (When a sub/superscript is recognized as a separate | |
| 605 // word, it falls victim to the rule that a whole word cannot be sub or | |
| 606 // superscript, so this function overrides that problem.) | |
| 607 void SetAllScriptPositions(tesseract::ScriptPos position); | |
| 608 | |
| 609 // Classifies the word with some already-calculated BLOB_CHOICEs. | |
| 610 // The choices are an array of blob_count pointers to BLOB_CHOICE, | |
| 611 // providing a single classifier result for each blob. | |
| 612 // The BLOB_CHOICEs are consumed and the word takes ownership. | |
| 613 // The number of blobs in the box_word must match blob_count. | |
| 614 void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices); | |
| 615 | |
| 616 // Creates a WERD_CHOICE for the word using the top choices from the leading | |
| 617 // diagonal of the ratings matrix. | |
| 618 void FakeWordFromRatings(PermuterType permuter); | |
| 619 | |
| 620 // Copies the best_choice strings to the correct_text for adaption/training. | |
| 621 void BestChoiceToCorrectText(); | |
| 622 | |
| 623 // Merges 2 adjacent blobs in the result if the callback class_cb returns | |
| 624 // a value other than INVALID_UNICHAR_ID, AND the callback box_cb is | |
| 625 // nullptr (an empty std::function) or returns true, setting the merged blob | |
| 626 // result to the class returned from class_cb. | |
| 627 // Returns true if anything was merged. | |
| 628 bool ConditionalBlobMerge( | |
| 629 const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb, | |
| 630 const std::function<bool(const TBOX &, const TBOX &)> &box_cb); | |
| 631 | |
| 632 // Merges 2 adjacent blobs in the result (index and index+1) and corrects | |
| 633 // all the data to account for the change. | |
| 634 void MergeAdjacentBlobs(unsigned index); | |
| 635 | |
| 636 // Callback helper for fix_quotes: returns a double quote if both | |
| 637 // arguments are quotes, otherwise INVALID_UNICHAR_ID. | |
| 638 UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2); | |
| 639 void fix_quotes(); | |
| 640 | |
| 641 // Callback helper for fix_hyphens: returns the UNICHAR_ID of - if both | |
| 642 // arguments are hyphens, otherwise INVALID_UNICHAR_ID. | |
| 643 UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2); | |
| 644 // Callback helper for fix_hyphens: returns true if box1 and box2 overlap | |
| 645 // (assuming both are on the same textline, are in order, and are a chopped em dash). | |
| 646 bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2); | |
| 647 void fix_hyphens(); | |
| 648 | |
| 649 // Callback helper for merge_tess_fails: returns a space if both | |
| 650 // arguments are spaces, otherwise INVALID_UNICHAR_ID. | |
| 651 UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2); | |
| 652 void merge_tess_fails(); | |
| 653 | |
| 654 // Returns a really deep copy of *src, including the ratings MATRIX. | |
| 655 static WERD_RES *deep_copy(const WERD_RES *src) { | |
| 656 // Copy construction does not duplicate the ratings MATRIX, so it is cloned | |
| 657 // separately below, but only when the source word actually has one. | |
| 658 WERD_RES *const clone = new WERD_RES(*src); | |
| 659 if (src->ratings == nullptr) { | |
| 660 return clone; // nothing further to duplicate | |
| 661 } | |
| 662 clone->ratings = src->ratings->DeepCopy(); return clone; | |
| 663 } | |
| 664 | |
| 665 // Copy blobs from word_res onto this word (eliminating spaces between). | |
| 666 // Since this may be called bidirectionally, the BOL and EOL flags of both words are ORed together. | |
| 667 void copy_on(WERD_RES *word_res) { // word_res: the word whose blobs are copied on | |
| 668 this->word->set_flag(W_EOL, this->word->flag(W_EOL) || word_res->word->flag(W_EOL)); | |
| 669 this->word->set_flag(W_BOL, this->word->flag(W_BOL) || word_res->word->flag(W_BOL)); | |
| 670 this->word->copy_on(word_res->word); // delegate the blob copy to the underlying word | |
| 671 } | |
| 672 | |
| 673 // Returns true if the collection of count pieces, starting at start, are all | |
| 674 // natural connected components, i.e. there are no real chops involved. | |
| 675 bool PiecesAllNatural(int start, int count) const; | |
| 676 }; | |
| 677 | |
| 678 /************************************************************************* | |
| 679 * PAGE_RES_IT - Page results iterator | |
| 680 *************************************************************************/ | |
| 681 | |
| 682 class TESS_API PAGE_RES_IT { // Steps through the words of a PAGE_RES, tracking the previous, current and next word. | |
| 683 public: | |
| 684 PAGE_RES *page_res = nullptr; // page being iterated | |
| 685 // Creates an unattached iterator: page_res and every position pointer are null. | |
| 686 PAGE_RES_IT() = default; | |
| 687 // Attaches to the_page_res and calls restart_page() so the iterator is ready to scan. | |
| 688 PAGE_RES_IT(PAGE_RES *the_page_res) // page result | |
| 689 : page_res(the_page_res) { | |
| 690 restart_page(); // ready to scan | |
| 691 } | |
| 692 | |
| 693 // Do two PAGE_RES_ITs point at the same word? Only the current word, row and | |
| 694 // block pointers are compared. This is much cheaper than cmp(). | |
| 695 bool operator==(const PAGE_RES_IT &other) const { | |
| 696 return word_res == other.word_res && row_res == other.row_res && | |
| 697 block_res == other.block_res; | |
| 698 } | |
| 699 | |
| 700 bool operator!=(const PAGE_RES_IT &other) const { | |
| 701 return !(*this == other); | |
| 702 } | |
| 703 | |
| 704 // Given another PAGE_RES_IT to the same page, | |
| 705 // this before other: -1 | |
| 706 // this equal to other: 0 | |
| 707 // this later than other: 1 | |
| 708 int cmp(const PAGE_RES_IT &other) const; | |
| 709 | |
| 710 WERD_RES *restart_page() { | |
| 711 return start_page(false); // Skip empty blocks. | |
| 712 } | |
| 713 WERD_RES *restart_page_with_empties() { | |
| 714 return start_page(true); // Allow empty blocks. | |
| 715 } | |
| 716 WERD_RES *start_page(bool empty_ok); | |
| 717 | |
| 718 WERD_RES *restart_row(); | |
| 719 | |
| 720 // ============ Methods that mutate the underlying structures =========== | |
| 721 // Note that these methods will potentially invalidate other PAGE_RES_ITs | |
| 722 // and are intended to be used only while a single PAGE_RES_IT is active. | |
| 723 // This problem needs to be taken into account if these mutation operators | |
| 724 // are ever provided to PageIterator or its subclasses. | |
| 725 | |
| 726 // Inserts the new_word and a corresponding WERD_RES before the current | |
| 727 // position. The simple fields of the WERD_RES are copied from clone_res and | |
| 728 // the resulting WERD_RES is returned for further setup with best_choice etc. | |
| 729 WERD_RES *InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word); | |
| 730 | |
| 731 // Replaces the current WERD/WERD_RES with the given words. The given words | |
| 732 // contain fake blobs that indicate the position of the characters. These are | |
| 733 // replaced with real blobs from the current word as much as possible. | |
| 734 void ReplaceCurrentWord(PointerVector<WERD_RES> *words); | |
| 735 | |
| 736 // Deletes the current WERD_RES and its underlying WERD. | |
| 737 void DeleteCurrentWord(); | |
| 738 | |
| 739 // Makes the current word a fuzzy space if not already fuzzy. Updates | |
| 740 // corresponding part of combo if required. | |
| 741 void MakeCurrentWordFuzzy(); | |
| 742 | |
| 743 WERD_RES *forward() { // Get next word. | |
| 744 return internal_forward(false, false); | |
| 745 } | |
| 746 // Move forward, but allow empty blocks to show as single nullptr words. | |
| 747 WERD_RES *forward_with_empties() { | |
| 748 return internal_forward(false, true); | |
| 749 } | |
| 750 | |
| 751 WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph | |
| 752 WERD_RES *forward_block(); // get first word in next non-empty block | |
| 753 | |
| 754 WERD_RES *prev_word() const { // previous word | |
| 755 return prev_word_res; | |
| 756 } | |
| 757 ROW_RES *prev_row() const { // row of prev word | |
| 758 return prev_row_res; | |
| 759 } | |
| 760 BLOCK_RES *prev_block() const { // block of prev word | |
| 761 return prev_block_res; | |
| 762 } | |
| 763 WERD_RES *word() const { // current word | |
| 764 return word_res; | |
| 765 } | |
| 766 ROW_RES *row() const { // row of current word | |
| 767 return row_res; | |
| 768 } | |
| 769 BLOCK_RES *block() const { // block of cur. word | |
| 770 return block_res; | |
| 771 } | |
| 772 WERD_RES *next_word() const { // next word | |
| 773 return next_word_res; | |
| 774 } | |
| 775 ROW_RES *next_row() const { // row of next word | |
| 776 return next_row_res; | |
| 777 } | |
| 778 BLOCK_RES *next_block() const { // block of next word | |
| 779 return next_block_res; | |
| 780 } | |
| 781 void rej_stat_word(); // for page/block/row | |
| 782 void ResetWordIterator(); | |
| 783 | |
| 784 private: | |
| 785 WERD_RES *internal_forward(bool new_block, bool empty_ok); | |
| 786 // Position pointers start out null so a default-constructed iterator has a defined state. | |
| 787 WERD_RES *prev_word_res = nullptr; // previous word | |
| 788 ROW_RES *prev_row_res = nullptr; // row of prev word | |
| 789 BLOCK_RES *prev_block_res = nullptr; // block of prev word | |
| 790 | |
| 791 WERD_RES *word_res = nullptr; // current word | |
| 792 ROW_RES *row_res = nullptr; // row of current word | |
| 793 BLOCK_RES *block_res = nullptr; // block of cur. word | |
| 794 | |
| 795 WERD_RES *next_word_res = nullptr; // next word | |
| 796 ROW_RES *next_row_res = nullptr; // row of next word | |
| 797 BLOCK_RES *next_block_res = nullptr; // block of next word | |
| 798 | |
| 799 BLOCK_RES_IT block_res_it; // iterators | |
| 800 ROW_RES_IT row_res_it; | |
| 801 WERD_RES_IT word_res_it; | |
| 802 // Iterators used to get the state of word_res_it for the current word. | |
| 803 // Since word_res_it is 2 words further on, this is otherwise hard to do. | |
| 804 WERD_RES_IT wr_it_of_current_word; | |
| 805 WERD_RES_IT wr_it_of_next_word; | |
| 806 }; | |
| 807 | |
| 808 } // namespace tesseract | |
| 809 | |
| 810 #endif |
