Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/ratngs.h @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: ratngs.h (Formerly ratings.h) | |
| 3 * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 1992, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #ifndef RATNGS_H | |
| 20 #define RATNGS_H | |
| 21 | |
| 22 #ifdef HAVE_CONFIG_H | |
| 23 # include "config_auto.h" // DISABLED_LEGACY_ENGINE | |
| 24 #endif | |
| 25 | |
| 26 #include "clst.h" | |
| 27 #include "elst.h" | |
| 28 #ifndef DISABLED_LEGACY_ENGINE | |
| 29 # include "fontinfo.h" | |
| 30 #endif // undef DISABLED_LEGACY_ENGINE | |
| 31 #include "matrix.h" | |
| 32 #include "unicharset.h" | |
| 33 #include "werd.h" | |
| 34 | |
| 35 #include <tesseract/unichar.h> | |
| 36 | |
| 37 #include <cassert> | |
| 38 #include <cfloat> // for FLT_MAX | |
| 39 | |
| 40 namespace tesseract { | |
| 41 | |
| 42 class MATRIX; | |
| 43 struct TBLOB; | |
| 44 struct TWERD; | |
| 45 | |
| 46 // Enum to describe the source of a BLOB_CHOICE to make it possible to determine | |
| 47 // whether a blob has been classified by inspecting the BLOB_CHOICEs. | |
| 48 enum BlobChoiceClassifier { | |
| 49 BCC_STATIC_CLASSIFIER, // From the char_norm classifier. | |
| 50 BCC_ADAPTED_CLASSIFIER, // From the adaptive classifier. | |
| 51 BCC_SPECKLE_CLASSIFIER, // Backup for failed classification. | |
| 52 BCC_AMBIG, // Generated by ambiguity detection. | |
| 53 BCC_FAKE, // From some other process. | |
| 54 }; | |
| 55 | |
| 56 class BLOB_CHOICE : public ELIST_LINK { | |
| 57 public: | |
| 58 BLOB_CHOICE() { | |
| 59 unichar_id_ = UNICHAR_SPACE; | |
| 60 fontinfo_id_ = -1; | |
| 61 fontinfo_id2_ = -1; | |
| 62 rating_ = 10.0f; | |
| 63 certainty_ = -1.0f; | |
| 64 script_id_ = -1; | |
| 65 min_xheight_ = 0.0f; | |
| 66 max_xheight_ = 0.0f; | |
| 67 yshift_ = 0.0f; | |
| 68 classifier_ = BCC_FAKE; | |
| 69 } | |
| 70 BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id | |
| 71 float src_rating, // rating | |
| 72 float src_cert, // certainty | |
| 73 int script_id, // script | |
| 74 float min_xheight, // min xheight in image pixel units | |
| 75 float max_xheight, // max xheight allowed by this char | |
| 76 float yshift, // the larger of y shift (top or bottom) | |
| 77 BlobChoiceClassifier c); // adapted match or other | |
| 78 BLOB_CHOICE(const BLOB_CHOICE &other); | |
| 79 ~BLOB_CHOICE() = default; | |
| 80 | |
| 81 UNICHAR_ID unichar_id() const { | |
| 82 return unichar_id_; | |
| 83 } | |
| 84 float rating() const { | |
| 85 return rating_; | |
| 86 } | |
| 87 float certainty() const { | |
| 88 return certainty_; | |
| 89 } | |
| 90 int16_t fontinfo_id() const { | |
| 91 return fontinfo_id_; | |
| 92 } | |
| 93 int16_t fontinfo_id2() const { | |
| 94 return fontinfo_id2_; | |
| 95 } | |
| 96 #ifndef DISABLED_LEGACY_ENGINE | |
| 97 const std::vector<ScoredFont> &fonts() const { | |
| 98 return fonts_; | |
| 99 } | |
| 100 void set_fonts(const std::vector<ScoredFont> &fonts) { | |
| 101 fonts_ = fonts; | |
| 102 int score1 = 0, score2 = 0; | |
| 103 fontinfo_id_ = -1; | |
| 104 fontinfo_id2_ = -1; | |
| 105 for (auto &f : fonts_) { | |
| 106 if (f.score > score1) { | |
| 107 score2 = score1; | |
| 108 fontinfo_id2_ = fontinfo_id_; | |
| 109 score1 = f.score; | |
| 110 fontinfo_id_ = f.fontinfo_id; | |
| 111 } else if (f.score > score2) { | |
| 112 score2 = f.score; | |
| 113 fontinfo_id2_ = f.fontinfo_id; | |
| 114 } | |
| 115 } | |
| 116 } | |
| 117 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 118 int script_id() const { | |
| 119 return script_id_; | |
| 120 } | |
| 121 const MATRIX_COORD &matrix_cell() { | |
| 122 return matrix_cell_; | |
| 123 } | |
| 124 float min_xheight() const { | |
| 125 return min_xheight_; | |
| 126 } | |
| 127 float max_xheight() const { | |
| 128 return max_xheight_; | |
| 129 } | |
| 130 float yshift() const { | |
| 131 return yshift_; | |
| 132 } | |
| 133 BlobChoiceClassifier classifier() const { | |
| 134 return classifier_; | |
| 135 } | |
| 136 bool IsAdapted() const { | |
| 137 return classifier_ == BCC_ADAPTED_CLASSIFIER; | |
| 138 } | |
| 139 bool IsClassified() const { | |
| 140 return classifier_ == BCC_STATIC_CLASSIFIER || classifier_ == BCC_ADAPTED_CLASSIFIER || | |
| 141 classifier_ == BCC_SPECKLE_CLASSIFIER; | |
| 142 } | |
| 143 | |
| 144 void set_unichar_id(UNICHAR_ID newunichar_id) { | |
| 145 unichar_id_ = newunichar_id; | |
| 146 } | |
| 147 void set_rating(float newrat) { | |
| 148 rating_ = newrat; | |
| 149 } | |
| 150 void set_certainty(float newrat) { | |
| 151 certainty_ = newrat; | |
| 152 } | |
| 153 void set_script(int newscript_id) { | |
| 154 script_id_ = newscript_id; | |
| 155 } | |
| 156 void set_matrix_cell(int col, int row) { | |
| 157 matrix_cell_.col = col; | |
| 158 matrix_cell_.row = row; | |
| 159 } | |
| 160 void set_classifier(BlobChoiceClassifier classifier) { | |
| 161 classifier_ = classifier; | |
| 162 } | |
| 163 static BLOB_CHOICE *deep_copy(const BLOB_CHOICE *src) { | |
| 164 auto *choice = new BLOB_CHOICE; | |
| 165 *choice = *src; | |
| 166 return choice; | |
| 167 } | |
| 168 // Returns true if *this and other agree on the baseline and x-height | |
| 169 // to within some tolerance based on a given estimate of the x-height. | |
| 170 bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const; | |
| 171 | |
| 172 void print(const UNICHARSET *unicharset) const { | |
| 173 tprintf("r%.2f c%.2f x[%g,%g]: %d %s", | |
| 174 static_cast<double>(rating_), | |
| 175 static_cast<double>(certainty_), | |
| 176 static_cast<double>(min_xheight_), | |
| 177 static_cast<double>(max_xheight_), | |
| 178 unichar_id_, (unicharset == nullptr) ? "" : unicharset->debug_str(unichar_id_).c_str()); | |
| 179 } | |
| 180 void print_full() const { | |
| 181 print(nullptr); | |
| 182 tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n", script_id_, fontinfo_id_, | |
| 183 fontinfo_id2_, static_cast<double>(yshift_), classifier_); | |
| 184 } | |
| 185 // Sort function for sorting BLOB_CHOICEs in increasing order of rating. | |
| 186 static int SortByRating(const void *p1, const void *p2) { | |
| 187 const BLOB_CHOICE *bc1 = *static_cast<const BLOB_CHOICE *const *>(p1); | |
| 188 const BLOB_CHOICE *bc2 = *static_cast<const BLOB_CHOICE *const *>(p2); | |
| 189 return (bc1->rating_ < bc2->rating_) ? -1 : 1; | |
| 190 } | |
| 191 | |
| 192 private: | |
| 193 // Copy assignment operator. | |
| 194 BLOB_CHOICE &operator=(const BLOB_CHOICE &other); | |
| 195 | |
| 196 UNICHAR_ID unichar_id_; // unichar id | |
| 197 #ifndef DISABLED_LEGACY_ENGINE | |
| 198 // Fonts and scores. Allowed to be empty. | |
| 199 std::vector<ScoredFont> fonts_; | |
| 200 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 201 int16_t fontinfo_id_; // char font information | |
| 202 int16_t fontinfo_id2_; // 2nd choice font information | |
| 203 // Rating is the classifier distance weighted by the length of the outline | |
| 204 // in the blob. In terms of probability, classifier distance is -klog p such | |
| 205 // that the resulting distance is in the range [0, 1] and then | |
| 206 // rating = w (-k log p) where w is the weight for the length of the outline. | |
| 207 // Sums of ratings may be compared meaningfully for words of different | |
| 208 // segmentation. | |
| 209 float rating_; // size related | |
| 210 // Certainty is a number in [-20, 0] indicating the classifier certainty | |
| 211 // of the choice. In terms of probability, certainty = 20 (k log p) where | |
| 212 // k is defined as above to normalize -klog p to the range [0, 1]. | |
| 213 float certainty_; // absolute | |
| 214 int script_id_; | |
| 215 // Holds the position of this choice in the ratings matrix. | |
| 216 // Used to locate the position in the matrix during path backtracking. | |
| 217 MATRIX_COORD matrix_cell_; | |
| 218 // X-height range (in image pixels) that this classification supports. | |
| 219 float min_xheight_; | |
| 220 float max_xheight_; | |
| 221 // yshift_ - The vertical distance (in image pixels) the character is | |
| 222 // shifted (up or down) from an acceptable y position. | |
| 223 float yshift_; | |
| 224 BlobChoiceClassifier classifier_; // What generated *this. | |
| 225 }; | |
| 226 | |
| 227 // Make BLOB_CHOICE listable. | |
| 228 ELISTIZEH(BLOB_CHOICE) | |
| 229 | |
| 230 // Return the BLOB_CHOICE in bc_list matching a given unichar_id, | |
| 231 // or nullptr if there is no match. | |
| 232 BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list); | |
| 233 | |
| 234 // Permuter codes used in WERD_CHOICEs. | |
| 235 enum PermuterType { | |
| 236 NO_PERM, // 0 | |
| 237 PUNC_PERM, // 1 | |
| 238 TOP_CHOICE_PERM, // 2 | |
| 239 LOWER_CASE_PERM, // 3 | |
| 240 UPPER_CASE_PERM, // 4 | |
| 241 NGRAM_PERM, // 5 | |
| 242 NUMBER_PERM, // 6 | |
| 243 USER_PATTERN_PERM, // 7 | |
| 244 SYSTEM_DAWG_PERM, // 8 | |
| 245 DOC_DAWG_PERM, // 9 | |
| 246 USER_DAWG_PERM, // 10 | |
| 247 FREQ_DAWG_PERM, // 11 | |
| 248 COMPOUND_PERM, // 12 | |
| 249 | |
| 250 NUM_PERMUTER_TYPES | |
| 251 }; | |
| 252 | |
| 253 // ScriptPos tells whether a character is normal, subscript, superscript or a dropcap. | |
| 254 enum ScriptPos { SP_NORMAL, SP_SUBSCRIPT, SP_SUPERSCRIPT, SP_DROPCAP }; | |
| 255 | |
| 256 const char *ScriptPosToString(ScriptPos script_pos); | |
| 257 | |
| 258 class TESS_API WERD_CHOICE : public ELIST_LINK { | |
| 259 public: | |
| 260 static const float kBadRating; | |
| 261 static const char *permuter_name(uint8_t permuter); | |
| 262 | |
| 263 WERD_CHOICE(const UNICHARSET *unicharset) : unicharset_(unicharset) { | |
| 264 this->init(8); | |
| 265 } | |
| 266 WERD_CHOICE(const UNICHARSET *unicharset, int reserved) : unicharset_(unicharset) { | |
| 267 this->init(reserved); | |
| 268 } | |
| 269 WERD_CHOICE(const char *src_string, const char *src_lengths, float src_rating, | |
| 270 float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset) | |
| 271 : unicharset_(&unicharset) { | |
| 272 this->init(src_string, src_lengths, src_rating, src_certainty, src_permuter); | |
| 273 } | |
| 274 WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset); | |
| 275 WERD_CHOICE(const WERD_CHOICE &word) : ELIST_LINK(word), unicharset_(word.unicharset_) { | |
| 276 this->init(word.length()); | |
| 277 this->operator=(word); | |
| 278 } | |
| 279 ~WERD_CHOICE(); | |
| 280 | |
| 281 const UNICHARSET *unicharset() const { | |
| 282 return unicharset_; | |
| 283 } | |
| 284 bool empty() const { | |
| 285 return length_ == 0; | |
| 286 } | |
| 287 inline unsigned length() const { | |
| 288 return length_; | |
| 289 } | |
| 290 float adjust_factor() const { | |
| 291 return adjust_factor_; | |
| 292 } | |
| 293 void set_adjust_factor(float factor) { | |
| 294 adjust_factor_ = factor; | |
| 295 } | |
| 296 inline const std::vector<UNICHAR_ID> &unichar_ids() const { | |
| 297 return unichar_ids_; | |
| 298 } | |
| 299 inline UNICHAR_ID unichar_id(unsigned index) const { | |
| 300 assert(index < length_); | |
| 301 return unichar_ids_[index]; | |
| 302 } | |
| 303 inline unsigned state(unsigned index) const { | |
| 304 return state_[index]; | |
| 305 } | |
| 306 ScriptPos BlobPosition(unsigned index) const { | |
| 307 if (index >= length_) { | |
| 308 return SP_NORMAL; | |
| 309 } | |
| 310 return script_pos_[index]; | |
| 311 } | |
| 312 inline float rating() const { | |
| 313 return rating_; | |
| 314 } | |
| 315 inline float certainty() const { | |
| 316 return certainty_; | |
| 317 } | |
| 318 inline float certainty(unsigned index) const { | |
| 319 return certainties_[index]; | |
| 320 } | |
| 321 inline float min_x_height() const { | |
| 322 return min_x_height_; | |
| 323 } | |
| 324 inline float max_x_height() const { | |
| 325 return max_x_height_; | |
| 326 } | |
| 327 inline void set_x_heights(float min_height, float max_height) { | |
| 328 min_x_height_ = min_height; | |
| 329 max_x_height_ = max_height; | |
| 330 } | |
| 331 inline uint8_t permuter() const { | |
| 332 return permuter_; | |
| 333 } | |
| 334 const char *permuter_name() const; | |
| 335 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word, | |
| 336 // taken from the appropriate cell in the ratings MATRIX. | |
| 337 // Borrowed pointer, so do not delete. | |
| 338 BLOB_CHOICE_LIST *blob_choices(unsigned index, MATRIX *ratings) const; | |
| 339 | |
| 340 // Returns the MATRIX_COORD corresponding to the location in the ratings | |
| 341 // MATRIX for the given index into the word. | |
| 342 MATRIX_COORD MatrixCoord(unsigned index) const; | |
| 343 | |
| 344 inline void set_unichar_id(UNICHAR_ID unichar_id, unsigned index) { | |
| 345 assert(index < length_); | |
| 346 unichar_ids_[index] = unichar_id; | |
| 347 } | |
| 348 bool dangerous_ambig_found() const { | |
| 349 return dangerous_ambig_found_; | |
| 350 } | |
| 351 void set_dangerous_ambig_found_(bool value) { | |
| 352 dangerous_ambig_found_ = value; | |
| 353 } | |
| 354 inline void set_rating(float new_val) { | |
| 355 rating_ = new_val; | |
| 356 } | |
| 357 inline void set_certainty(float new_val) { | |
| 358 certainty_ = new_val; | |
| 359 } | |
| 360 inline void set_permuter(uint8_t perm) { | |
| 361 permuter_ = perm; | |
| 362 } | |
| 363 // Note: this function should only be used if all the fields | |
| 364 // are populated manually with set_* functions (rather than | |
| 365 // (copy)constructors and append_* functions). | |
| 366 inline void set_length(unsigned len) { | |
| 367 ASSERT_HOST(reserved_ >= len); | |
| 368 length_ = len; | |
| 369 } | |
| 370 | |
| 371 /// Make more space in the unichar_ids_, script_pos_, state_ and certainties_ arrays. | |
| 372 inline void double_the_size() { | |
| 373 if (reserved_ > 0) { | |
| 374 reserved_ *= 2; | |
| 375 } else { | |
| 376 reserved_ = 1; | |
| 377 } | |
| 378 unichar_ids_.resize(reserved_); | |
| 379 script_pos_.resize(reserved_); | |
| 380 state_.resize(reserved_); | |
| 381 certainties_.resize(reserved_); | |
| 382 } | |
| 383 | |
| 384 /// Initializes WERD_CHOICE - reserves slots in the unichar_ids_, script_pos_, | |
| 385 /// state_ and certainties_ arrays. Sets other values to default (blank) values. | |
| 386 inline void init(unsigned reserved) { | |
| 387 reserved_ = reserved; | |
| 388 if (reserved > 0) { | |
| 389 unichar_ids_.resize(reserved); | |
| 390 script_pos_.resize(reserved); | |
| 391 state_.resize(reserved); | |
| 392 certainties_.resize(reserved); | |
| 393 } else { | |
| 394 unichar_ids_.clear(); | |
| 395 script_pos_.clear(); | |
| 396 state_.clear(); | |
| 397 certainties_.clear(); | |
| 398 } | |
| 399 length_ = 0; | |
| 400 adjust_factor_ = 1.0f; | |
| 401 rating_ = 0.0; | |
| 402 certainty_ = FLT_MAX; | |
| 403 min_x_height_ = 0.0f; | |
| 404 max_x_height_ = FLT_MAX; | |
| 405 permuter_ = NO_PERM; | |
| 406 unichars_in_script_order_ = false; // Tesseract is strict left-to-right. | |
| 407 dangerous_ambig_found_ = false; | |
| 408 } | |
| 409 | |
| 410 /// Helper function to build a WERD_CHOICE from the given string, | |
| 411 /// fragment lengths, rating, certainty and permuter. | |
| 412 /// The function assumes that src_string is not nullptr. | |
| 413 /// src_lengths argument could be nullptr, in which case the unichars | |
| 414 /// in src_string are assumed to all be of length 1. | |
| 415 void init(const char *src_string, const char *src_lengths, float src_rating, float src_certainty, | |
| 416 uint8_t src_permuter); | |
| 417 | |
| 418 /// Set the fields in this choice to be default (bad) values. | |
| 419 inline void make_bad() { | |
| 420 length_ = 0; | |
| 421 rating_ = kBadRating; | |
| 422 certainty_ = -FLT_MAX; | |
| 423 } | |
| 424 | |
| 425 /// This function assumes that there is enough space reserved | |
| 426 /// in the WERD_CHOICE for adding another unichar. | |
| 427 /// This is an efficient alternative to append_unichar_id(). | |
| 428 inline void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, | |
| 429 float certainty) { | |
| 430 assert(reserved_ > length_); | |
| 431 length_++; | |
| 432 this->set_unichar_id(unichar_id, blob_count, rating, certainty, length_ - 1); | |
| 433 } | |
| 434 | |
| 435 void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty); | |
| 436 | |
| 437 inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty, | |
| 438 unsigned index) { | |
| 439 assert(index < length_); | |
| 440 unichar_ids_[index] = unichar_id; | |
| 441 state_[index] = blob_count; | |
| 442 certainties_[index] = certainty; | |
| 443 script_pos_[index] = SP_NORMAL; | |
| 444 rating_ += rating; | |
| 445 if (certainty < certainty_) { | |
| 446 certainty_ = certainty; | |
| 447 } | |
| 448 } | |
| 449 // Sets the entries for the given index from the BLOB_CHOICE, assuming | |
| 450 // unit fragment lengths, but setting the state for this index to blob_count. | |
| 451 void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice); | |
| 452 | |
| 453 bool contains_unichar_id(UNICHAR_ID unichar_id) const; | |
| 454 void remove_unichar_ids(unsigned index, int num); | |
| 455 inline void remove_last_unichar_id() { | |
| 456 --length_; | |
| 457 } | |
| 458 inline void remove_unichar_id(unsigned index) { | |
| 459 this->remove_unichar_ids(index, 1); | |
| 460 } | |
| 461 bool has_rtl_unichar_id() const; | |
| 462 void reverse_and_mirror_unichar_ids(); | |
| 463 | |
| 464 // Returns the half-open interval of unichar_id indices [start, end) which | |
| 465 // enclose the core portion of this word -- the part after stripping | |
| 466 // punctuation from the left and right. | |
| 467 void punct_stripped(unsigned *start_core, unsigned *end_core) const; | |
| 468 | |
| 469 // Returns the indices [start, end) containing the core of the word, stripped | |
| 470 // of any superscript digits on either side. (i.e., the non-footnote part | |
| 471 // of the word). There is no guarantee that the output range is non-empty. | |
| 472 void GetNonSuperscriptSpan(int *start, int *end) const; | |
| 473 | |
| 474 // Return a copy of this WERD_CHOICE with the choices [start, end). | |
| 475 // The result is useful only for checking against a dictionary. | |
| 476 WERD_CHOICE shallow_copy(unsigned start, unsigned end) const; | |
| 477 | |
| 478 void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const; | |
| 479 std::string debug_string() const { | |
| 480 std::string word_str; | |
| 481 for (unsigned i = 0; i < length_; ++i) { | |
| 482 word_str += unicharset_->debug_str(unichar_ids_[i]); | |
| 483 word_str += " "; | |
| 484 } | |
| 485 return word_str; | |
| 486 } | |
| 487 // Returns true if any unichar_id in the word is a non-space-delimited char. | |
| 488 bool ContainsAnyNonSpaceDelimited() const { | |
| 489 for (unsigned i = 0; i < length_; ++i) { | |
| 490 if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) { | |
| 491 return true; | |
| 492 } | |
| 493 } | |
| 494 return false; | |
| 495 } | |
| 496 // Returns true if the word is all spaces. | |
| 497 bool IsAllSpaces() const { | |
| 498 for (unsigned i = 0; i < length_; ++i) { | |
| 499 if (unichar_ids_[i] != UNICHAR_SPACE) { | |
| 500 return false; | |
| 501 } | |
| 502 } | |
| 503 return true; | |
| 504 } | |
| 505 | |
| 506 // Call this to override the default (strict left to right graphemes) | |
| 507 // with the fact that some engine produces a "reading order" set of | |
| 508 // Graphemes for each word. | |
| 509 bool set_unichars_in_script_order(bool in_script_order) { | |
| 510 return unichars_in_script_order_ = in_script_order; | |
| 511 } | |
| 512 | |
| 513 bool unichars_in_script_order() const { | |
| 514 return unichars_in_script_order_; | |
| 515 } | |
| 516 | |
| 517 // Returns a UTF-8 string equivalent to the current choice | |
| 518 // of UNICHAR IDs. | |
| 519 std::string &unichar_string() { | |
| 520 this->string_and_lengths(&unichar_string_, &unichar_lengths_); | |
| 521 return unichar_string_; | |
| 522 } | |
| 523 | |
| 524 // Returns a UTF-8 string equivalent to the current choice | |
| 525 // of UNICHAR IDs. | |
| 526 const std::string &unichar_string() const { | |
| 527 this->string_and_lengths(&unichar_string_, &unichar_lengths_); | |
| 528 return unichar_string_; | |
| 529 } | |
| 530 | |
| 531 // Returns the lengths, one byte each, representing the number of bytes | |
| 532 // required in the unichar_string for each UNICHAR_ID. | |
| 533 const std::string &unichar_lengths() const { | |
| 534 this->string_and_lengths(&unichar_string_, &unichar_lengths_); | |
| 535 return unichar_lengths_; | |
| 536 } | |
| 537 | |
| 538 // Sets up the script_pos_ member using the blobs_list to get the bln | |
| 539 // bounding boxes, *this to get the unichars, and this->unicharset | |
| 540 // to get the target positions. If small_caps is true, sub/super are not | |
| 541 // considered, but dropcaps are. | |
| 542 // NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.) | |
| 543 void SetScriptPositions(bool small_caps, TWERD *word, int debug = 0); | |
| 544 // Sets all the script_pos_ positions to the given position. | |
| 545 void SetAllScriptPositions(ScriptPos position); | |
| 546 | |
| 547 static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, | |
| 548 const TBOX &blob_box, UNICHAR_ID unichar_id); | |
| 549 | |
| 550 // Returns the "dominant" script ID for the word. By "dominant", the script | |
| 551 // must account for at least half the characters. Otherwise, it returns 0. | |
| 552 // Note that for Japanese, Hiragana and Katakana are simply treated as Han. | |
| 553 int GetTopScriptID() const; | |
| 554 | |
| 555 // Fixes the state_ for a chop at the given blob_position. | |
| 556 void UpdateStateForSplit(int blob_position); | |
| 557 | |
| 558 // Returns the sum of all the state elements, being the total number of blobs. | |
| 559 unsigned TotalOfStates() const; | |
| 560 | |
| 561 void print() const { | |
| 562 this->print(""); | |
| 563 } | |
| 564 void print(const char *msg) const; | |
| 565 // Prints the segmentation state with an introductory message. | |
| 566 void print_state(const char *msg) const; | |
| 567 | |
| 568 // Displays the segmentation state of *this (if not the same as the last | |
| 569 // one displayed) and waits for a click in the window. | |
| 570 void DisplaySegmentation(TWERD *word); | |
| 571 | |
| 572 WERD_CHOICE &operator+=( // concatenate | |
| 573 const WERD_CHOICE &second); // second on first | |
| 574 | |
| 575 WERD_CHOICE &operator=(const WERD_CHOICE &source); | |
| 576 | |
| 577 private: | |
| 578 const UNICHARSET *unicharset_; | |
| 579 // TODO(rays) Perhaps replace the multiple arrays with an array of structs? | |
| 580 // unichar_ids_ is an array of classifier "results" that make up a word. | |
| 581 // For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position | |
| 582 // of each unichar_id. | |
| 583 // state_[i] indicates the number of blobs in WERD_RES::chopped_word that | |
| 584 // were put together to make the classification results in the ith position | |
| 585 // in unichar_ids_, and certainties_[i] is the certainty of the choice that | |
| 586 // was used in this word. | |
| 587 // == Change from before == | |
| 588 // Previously there was fragment_lengths_ that allowed a word to be | |
| 589 // artificially composed of multiple fragment results. Since the new | |
| 590 // segmentation search doesn't do fragments, treatment of fragments has | |
| 591 // been moved to a lower level, augmenting the ratings matrix with the | |
| 592 // combined fragments, and allowing the language-model/segmentation-search | |
| 593 // to deal with only the combined unichar_ids. | |
| 594 std::vector<UNICHAR_ID> unichar_ids_; // unichar ids that represent the text of the word | |
| 595 std::vector<ScriptPos> script_pos_; // Normal/Sub/Superscript of each unichar. | |
| 596 std::vector<int> state_; // Number of blobs in each unichar. | |
| 597 std::vector<float> certainties_; // Certainty of each unichar. | |
| 598 unsigned reserved_; // size of the above arrays | |
| 599 unsigned length_; // word length | |
| 600 // Factor that was used to adjust the rating. | |
| 601 float adjust_factor_; | |
| 602 // Rating is the sum of the ratings of the individual blobs in the word. | |
| 603 float rating_; // size related | |
| 604 // certainty is the min (worst) certainty of the individual blobs in the word. | |
| 605 float certainty_; // absolute | |
| 606 // xheight computed from the result, or 0 if inconsistent. | |
| 607 float min_x_height_; | |
| 608 float max_x_height_; | |
| 609 uint8_t permuter_; // permuter code | |
| 610 | |
| 611 // Normally, the ratings_ matrix represents the recognition results in order | |
| 612 // from left-to-right. However, some engines (say Cube) may return | |
| 613 // recognition results in the order of the script's major reading direction | |
| 614 // (for Arabic, that is right-to-left). | |
| 615 bool unichars_in_script_order_; | |
| 616 // True if NoDangerousAmbig found an ambiguity. | |
| 617 bool dangerous_ambig_found_; | |
| 618 | |
| 619 // The following variables are populated and passed by reference any | |
| 620 // time unichar_string() or unichar_lengths() are called. | |
| 621 mutable std::string unichar_string_; | |
| 622 mutable std::string unichar_lengths_; | |
| 623 }; | |
| 624 | |
| 625 // Make WERD_CHOICE listable. | |
| 626 ELISTIZEH(WERD_CHOICE) | |
| 627 using BLOB_CHOICE_LIST_VECTOR = std::vector<BLOB_CHOICE_LIST *>; | |
| 628 | |
| 629 // Utilities for comparing WERD_CHOICEs | |
| 630 | |
| 631 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2); | |
| 632 | |
| 633 // Utilities for debug printing. | |
| 634 void print_ratings_list(const char *msg, // intro message | |
| 635 BLOB_CHOICE_LIST *ratings, // list of results | |
| 636 const UNICHARSET &current_unicharset // unicharset that can be used | |
| 637 // for id-to-unichar conversion | |
| 638 ); | |
| 639 | |
| 640 } // namespace tesseract | |
| 641 | |
| 642 #endif |
