Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/ratngs.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: ratngs.cpp (Formerly ratings.c) | |
| 3 * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 1992, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #ifdef HAVE_CONFIG_H | |
| 20 # include "config_auto.h" | |
| 21 #endif | |
| 22 | |
| 23 #include "ratngs.h" | |
| 24 | |
| 25 #include "blobs.h" | |
| 26 #include "matrix.h" | |
| 27 #include "normalis.h" // kBlnBaselineOffset. | |
| 28 #include "unicharset.h" | |
| 29 | |
| 30 #include <algorithm> | |
| 31 #include <cmath> | |
| 32 #include <string> | |
| 33 #include <vector> | |
| 34 | |
| 35 namespace tesseract { | |
| 36 | |
| 37 const float WERD_CHOICE::kBadRating = 100000.0; | |
| 38 // Min offset in baseline-normalized coords to make a character a subscript. | |
| 39 const int kMinSubscriptOffset = 20; | |
| 40 // Min offset in baseline-normalized coords to make a character a superscript. | |
| 41 const int kMinSuperscriptOffset = 20; | |
| 42 // Max y of bottom of a drop-cap blob. | |
| 43 const int kMaxDropCapBottom = -128; | |
| 44 // Max fraction of x-height to use as denominator in measuring x-height overlap. | |
| 45 const double kMaxOverlapDenominator = 0.125; | |
| 46 // Min fraction of x-height range that should be in agreement for matching | |
| 47 // x-heights. | |
| 48 const double kMinXHeightMatch = 0.5; | |
| 49 // Max tolerance on baseline position as a fraction of x-height for matching | |
| 50 // baselines. | |
| 51 const double kMaxBaselineDrift = 0.0625; | |
| 52 | |
// Display strings for the permuter types, one named constant per type.
static const char kPermuterTypeNoPerm[] = "None";
static const char kPermuterTypePuncPerm[] = "Punctuation";
static const char kPermuterTypeTopPerm[] = "Top Choice";
static const char kPermuterTypeLowerPerm[] = "Top Lower Case";
static const char kPermuterTypeUpperPerm[] = "Top Upper Case";
static const char kPermuterTypeNgramPerm[] = "Ngram";
static const char kPermuterTypeNumberPerm[] = "Number";
static const char kPermuterTypeUserPatPerm[] = "User Pattern";
static const char kPermuterTypeSysDawgPerm[] = "System Dictionary";
static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
static const char kPermuterTypeUserDawgPerm[] = "User Dictionary";
static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
static const char kPermuterTypeCompoundPerm[] = "Compound";

// Table mapping a numeric permuter value (the array index) to its display
// string. WERD_CHOICE::permuter_name() indexes straight into this table.
static const char *const kPermuterTypeNames[] = {
    kPermuterTypeNoPerm,       // index 0
    kPermuterTypePuncPerm,     // index 1
    kPermuterTypeTopPerm,      // index 2
    kPermuterTypeLowerPerm,    // index 3
    kPermuterTypeUpperPerm,    // index 4
    kPermuterTypeNgramPerm,    // index 5
    kPermuterTypeNumberPerm,   // index 6
    kPermuterTypeUserPatPerm,  // index 7
    kPermuterTypeSysDawgPerm,  // index 8
    kPermuterTypeDocDawgPerm,  // index 9
    kPermuterTypeUserDawgPerm, // index 10
    kPermuterTypeFreqDawgPerm, // index 11
    kPermuterTypeCompoundPerm  // index 12
};
| 82 | |
| 83 /** | |
| 84 * BLOB_CHOICE::BLOB_CHOICE | |
| 85 * | |
| 86 * Constructor to build a BLOB_CHOICE from a char, rating and certainty. | |
| 87 */ | |
| 88 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id | |
| 89 float src_rating, // rating | |
| 90 float src_cert, // certainty | |
| 91 int src_script_id, // script | |
| 92 float min_xheight, // min xheight allowed | |
| 93 float max_xheight, // max xheight by this char | |
| 94 float yshift, // yshift out of position | |
| 95 BlobChoiceClassifier c) { // adapted match or other | |
| 96 unichar_id_ = src_unichar_id; | |
| 97 rating_ = src_rating; | |
| 98 certainty_ = src_cert; | |
| 99 fontinfo_id_ = -1; | |
| 100 fontinfo_id2_ = -1; | |
| 101 script_id_ = src_script_id; | |
| 102 min_xheight_ = min_xheight; | |
| 103 max_xheight_ = max_xheight; | |
| 104 yshift_ = yshift; | |
| 105 classifier_ = c; | |
| 106 } | |
| 107 | |
| 108 /** | |
| 109 * BLOB_CHOICE::BLOB_CHOICE | |
| 110 * | |
| 111 * Constructor to build a BLOB_CHOICE from another BLOB_CHOICE. | |
| 112 */ | |
| 113 BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) : ELIST_LINK(other) { | |
| 114 unichar_id_ = other.unichar_id(); | |
| 115 rating_ = other.rating(); | |
| 116 certainty_ = other.certainty(); | |
| 117 fontinfo_id_ = other.fontinfo_id(); | |
| 118 fontinfo_id2_ = other.fontinfo_id2(); | |
| 119 script_id_ = other.script_id(); | |
| 120 matrix_cell_ = other.matrix_cell_; | |
| 121 min_xheight_ = other.min_xheight_; | |
| 122 max_xheight_ = other.max_xheight_; | |
| 123 yshift_ = other.yshift(); | |
| 124 classifier_ = other.classifier_; | |
| 125 #ifndef DISABLED_LEGACY_ENGINE | |
| 126 fonts_ = other.fonts_; | |
| 127 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 128 } | |
| 129 | |
| 130 // Copy assignment operator. | |
| 131 BLOB_CHOICE &BLOB_CHOICE::operator=(const BLOB_CHOICE &other) { | |
| 132 ELIST_LINK::operator=(other); | |
| 133 unichar_id_ = other.unichar_id(); | |
| 134 rating_ = other.rating(); | |
| 135 certainty_ = other.certainty(); | |
| 136 fontinfo_id_ = other.fontinfo_id(); | |
| 137 fontinfo_id2_ = other.fontinfo_id2(); | |
| 138 script_id_ = other.script_id(); | |
| 139 matrix_cell_ = other.matrix_cell_; | |
| 140 min_xheight_ = other.min_xheight_; | |
| 141 max_xheight_ = other.max_xheight_; | |
| 142 yshift_ = other.yshift(); | |
| 143 classifier_ = other.classifier_; | |
| 144 #ifndef DISABLED_LEGACY_ENGINE | |
| 145 fonts_ = other.fonts_; | |
| 146 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 147 return *this; | |
| 148 } | |
| 149 | |
| 150 // Returns true if *this and other agree on the baseline and x-height | |
| 151 // to within some tolerance based on a given estimate of the x-height. | |
| 152 bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const { | |
| 153 double baseline_diff = std::fabs(yshift() - other.yshift()); | |
| 154 if (baseline_diff > kMaxBaselineDrift * x_height) { | |
| 155 if (debug) { | |
| 156 tprintf("Baseline diff %g for %d v %d\n", baseline_diff, unichar_id_, other.unichar_id_); | |
| 157 } | |
| 158 return false; | |
| 159 } | |
| 160 double this_range = max_xheight() - min_xheight(); | |
| 161 double other_range = other.max_xheight() - other.min_xheight(); | |
| 162 double denominator = | |
| 163 ClipToRange(std::min(this_range, other_range), 1.0, kMaxOverlapDenominator * x_height); | |
| 164 double overlap = | |
| 165 std::min(max_xheight(), other.max_xheight()) - std::max(min_xheight(), other.min_xheight()); | |
| 166 overlap /= denominator; | |
| 167 if (debug) { | |
| 168 tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n", unichar_id_, | |
| 169 other.unichar_id_, baseline_diff, this_range, other_range, denominator, overlap); | |
| 170 } | |
| 171 | |
| 172 return overlap >= kMinXHeightMatch; | |
| 173 } | |
| 174 | |
| 175 // Helper to find the BLOB_CHOICE in the bc_list that matches the given | |
| 176 // unichar_id, or nullptr if there is no match. | |
| 177 BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list) { | |
| 178 // Find the corresponding best BLOB_CHOICE. | |
| 179 BLOB_CHOICE_IT choice_it(bc_list); | |
| 180 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { | |
| 181 BLOB_CHOICE *choice = choice_it.data(); | |
| 182 if (choice->unichar_id() == char_id) { | |
| 183 return choice; | |
| 184 } | |
| 185 } | |
| 186 return nullptr; | |
| 187 } | |
| 188 | |
| 189 const char *WERD_CHOICE::permuter_name(uint8_t permuter) { | |
| 190 return kPermuterTypeNames[permuter]; | |
| 191 } | |
| 192 | |
| 193 const char *ScriptPosToString(enum ScriptPos script_pos) { | |
| 194 switch (script_pos) { | |
| 195 case SP_NORMAL: | |
| 196 return "NORM"; | |
| 197 case SP_SUBSCRIPT: | |
| 198 return "SUB"; | |
| 199 case SP_SUPERSCRIPT: | |
| 200 return "SUPER"; | |
| 201 case SP_DROPCAP: | |
| 202 return "DROPC"; | |
| 203 } | |
| 204 return "SP_UNKNOWN"; | |
| 205 } | |
| 206 | |
| 207 /** | |
| 208 * WERD_CHOICE::WERD_CHOICE | |
| 209 * | |
| 210 * Constructor to build a WERD_CHOICE from the given string. | |
| 211 * The function assumes that src_string is not nullptr. | |
| 212 */ | |
| 213 WERD_CHOICE::WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset) | |
| 214 : unicharset_(&unicharset) { | |
| 215 std::vector<UNICHAR_ID> encoding; | |
| 216 std::vector<char> lengths; | |
| 217 std::string cleaned = unicharset.CleanupString(src_string); | |
| 218 if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths, nullptr)) { | |
| 219 lengths.push_back('\0'); | |
| 220 std::string src_lengths = &lengths[0]; | |
| 221 this->init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0, NO_PERM); | |
| 222 } else { // There must have been an invalid unichar in the string. | |
| 223 this->init(8); | |
| 224 this->make_bad(); | |
| 225 } | |
| 226 } | |
| 227 | |
| 228 /** | |
| 229 * WERD_CHOICE::init | |
| 230 * | |
| 231 * Helper function to build a WERD_CHOICE from the given string, | |
| 232 * fragment lengths, rating, certainty and permuter. | |
| 233 * | |
| 234 * The function assumes that src_string is not nullptr. | |
| 235 * src_lengths argument could be nullptr, in which case the unichars | |
| 236 * in src_string are assumed to all be of length 1. | |
| 237 */ | |
| 238 void WERD_CHOICE::init(const char *src_string, const char *src_lengths, float src_rating, | |
| 239 float src_certainty, uint8_t src_permuter) { | |
| 240 int src_string_len = strlen(src_string); | |
| 241 if (src_string_len == 0) { | |
| 242 this->init(8); | |
| 243 } else { | |
| 244 this->init(src_lengths ? strlen(src_lengths) : src_string_len); | |
| 245 length_ = reserved_; | |
| 246 int offset = 0; | |
| 247 for (unsigned i = 0; i < length_; ++i) { | |
| 248 int unichar_length = src_lengths ? src_lengths[i] : 1; | |
| 249 unichar_ids_[i] = unicharset_->unichar_to_id(src_string + offset, unichar_length); | |
| 250 state_[i] = 1; | |
| 251 certainties_[i] = src_certainty; | |
| 252 offset += unichar_length; | |
| 253 } | |
| 254 } | |
| 255 adjust_factor_ = 1.0f; | |
| 256 rating_ = src_rating; | |
| 257 certainty_ = src_certainty; | |
| 258 permuter_ = src_permuter; | |
| 259 dangerous_ambig_found_ = false; | |
| 260 } | |
| 261 | |
| 262 /** | |
| 263 * WERD_CHOICE::~WERD_CHOICE | |
| 264 */ | |
| 265 WERD_CHOICE::~WERD_CHOICE() = default; | |
| 266 | |
| 267 const char *WERD_CHOICE::permuter_name() const { | |
| 268 return kPermuterTypeNames[permuter_]; | |
| 269 } | |
| 270 | |
| 271 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word, | |
| 272 // taken from the appropriate cell in the ratings MATRIX. | |
| 273 // Borrowed pointer, so do not delete. | |
| 274 BLOB_CHOICE_LIST *WERD_CHOICE::blob_choices(unsigned index, MATRIX *ratings) const { | |
| 275 MATRIX_COORD coord = MatrixCoord(index); | |
| 276 BLOB_CHOICE_LIST *result = ratings->get(coord.col, coord.row); | |
| 277 if (result == nullptr) { | |
| 278 result = new BLOB_CHOICE_LIST; | |
| 279 ratings->put(coord.col, coord.row, result); | |
| 280 } | |
| 281 return result; | |
| 282 } | |
| 283 | |
| 284 // Returns the MATRIX_COORD corresponding to the location in the ratings | |
| 285 // MATRIX for the given index into the word. | |
| 286 MATRIX_COORD WERD_CHOICE::MatrixCoord(unsigned index) const { | |
| 287 int col = 0; | |
| 288 for (unsigned i = 0; i < index; ++i) { | |
| 289 col += state_[i]; | |
| 290 } | |
| 291 int row = col + state_[index] - 1; | |
| 292 return MATRIX_COORD(col, row); | |
| 293 } | |
| 294 | |
| 295 // Sets the entries for the given index from the BLOB_CHOICE, assuming | |
| 296 // unit fragment lengths, but setting the state for this index to blob_count. | |
| 297 void WERD_CHOICE::set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice) { | |
| 298 unichar_ids_[index] = blob_choice->unichar_id(); | |
| 299 script_pos_[index] = tesseract::SP_NORMAL; | |
| 300 state_[index] = blob_count; | |
| 301 certainties_[index] = blob_choice->certainty(); | |
| 302 } | |
| 303 | |
| 304 /** | |
| 305 * contains_unichar_id | |
| 306 * | |
| 307 * Returns true if unichar_ids_ contain the given unichar_id, false otherwise. | |
| 308 */ | |
| 309 bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const { | |
| 310 for (unsigned i = 0; i < length_; ++i) { | |
| 311 if (unichar_ids_[i] == unichar_id) { | |
| 312 return true; | |
| 313 } | |
| 314 } | |
| 315 return false; | |
| 316 } | |
| 317 | |
| 318 /** | |
| 319 * remove_unichar_ids | |
| 320 * | |
| 321 * Removes num unichar ids starting from index start from unichar_ids_ | |
| 322 * and updates length_ and fragment_lengths_ to reflect this change. | |
| 323 * Note: this function does not modify rating_ and certainty_. | |
| 324 */ | |
| 325 void WERD_CHOICE::remove_unichar_ids(unsigned start, int num) { | |
| 326 ASSERT_HOST(start + num <= length_); | |
| 327 // Accumulate the states to account for the merged blobs. | |
| 328 for (int i = 0; i < num; ++i) { | |
| 329 if (start > 0) { | |
| 330 state_[start - 1] += state_[start + i]; | |
| 331 } else if (start + num < length_) { | |
| 332 state_[start + num] += state_[start + i]; | |
| 333 } | |
| 334 } | |
| 335 for (unsigned i = start; i + num < length_; ++i) { | |
| 336 unichar_ids_[i] = unichar_ids_[i + num]; | |
| 337 script_pos_[i] = script_pos_[i + num]; | |
| 338 state_[i] = state_[i + num]; | |
| 339 certainties_[i] = certainties_[i + num]; | |
| 340 } | |
| 341 length_ -= num; | |
| 342 } | |
| 343 | |
| 344 /** | |
| 345 * reverse_and_mirror_unichar_ids | |
| 346 * | |
| 347 * Reverses and mirrors unichars in unichar_ids. | |
| 348 */ | |
| 349 void WERD_CHOICE::reverse_and_mirror_unichar_ids() { | |
| 350 for (unsigned i = 0; i < length_ / 2; ++i) { | |
| 351 UNICHAR_ID tmp_id = unichar_ids_[i]; | |
| 352 unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_ - 1 - i]); | |
| 353 unichar_ids_[length_ - 1 - i] = unicharset_->get_mirror(tmp_id); | |
| 354 } | |
| 355 if (length_ % 2 != 0) { | |
| 356 unichar_ids_[length_ / 2] = unicharset_->get_mirror(unichar_ids_[length_ / 2]); | |
| 357 } | |
| 358 } | |
| 359 | |
| 360 /** | |
| 361 * punct_stripped | |
| 362 * | |
| 363 * Returns the half-open interval of unichar_id indices [start, end) which | |
| 364 * enclose the core portion of this word -- the part after stripping | |
| 365 * punctuation from the left and right. | |
| 366 */ | |
| 367 void WERD_CHOICE::punct_stripped(unsigned *start, unsigned *end) const { | |
| 368 *start = 0; | |
| 369 *end = length(); | |
| 370 while (*start < length() && unicharset()->get_ispunctuation(unichar_id(*start))) { | |
| 371 (*start)++; | |
| 372 } | |
| 373 while (*end > 0 && unicharset()->get_ispunctuation(unichar_id(*end - 1))) { | |
| 374 (*end)--; | |
| 375 } | |
| 376 } | |
| 377 | |
| 378 void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const { | |
| 379 int end = length(); | |
| 380 while (end > 0 && unicharset_->get_isdigit(unichar_ids_[end - 1]) && | |
| 381 BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) { | |
| 382 end--; | |
| 383 } | |
| 384 int start = 0; | |
| 385 while (start < end && unicharset_->get_isdigit(unichar_ids_[start]) && | |
| 386 BlobPosition(start) == tesseract::SP_SUPERSCRIPT) { | |
| 387 start++; | |
| 388 } | |
| 389 *pstart = start; | |
| 390 *pend = end; | |
| 391 } | |
| 392 | |
| 393 WERD_CHOICE WERD_CHOICE::shallow_copy(unsigned start, unsigned end) const { | |
| 394 ASSERT_HOST(start <= length_); | |
| 395 ASSERT_HOST(end <= length_); | |
| 396 if (end < start) { | |
| 397 end = start; | |
| 398 } | |
| 399 WERD_CHOICE retval(unicharset_, end - start); | |
| 400 for (auto i = start; i < end; i++) { | |
| 401 retval.append_unichar_id_space_allocated(unichar_ids_[i], state_[i], 0.0f, certainties_[i]); | |
| 402 } | |
| 403 return retval; | |
| 404 } | |
| 405 | |
| 406 /** | |
| 407 * has_rtl_unichar_id | |
| 408 * | |
| 409 * Returns true if unichar_ids contain at least one "strongly" RTL unichar. | |
| 410 */ | |
| 411 bool WERD_CHOICE::has_rtl_unichar_id() const { | |
| 412 for (unsigned i = 0; i < length_; ++i) { | |
| 413 UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]); | |
| 414 if (dir == UNICHARSET::U_RIGHT_TO_LEFT || dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) { | |
| 415 return true; | |
| 416 } | |
| 417 } | |
| 418 return false; | |
| 419 } | |
| 420 | |
| 421 /** | |
| 422 * string_and_lengths | |
| 423 * | |
| 424 * Populates the given word_str with unichars from unichar_ids and | |
| 425 * and word_lengths_str with the corresponding unichar lengths. | |
| 426 */ | |
| 427 void WERD_CHOICE::string_and_lengths(std::string *word_str, std::string *word_lengths_str) const { | |
| 428 *word_str = ""; | |
| 429 if (word_lengths_str != nullptr) { | |
| 430 *word_lengths_str = ""; | |
| 431 } | |
| 432 for (unsigned i = 0; i < length_; ++i) { | |
| 433 const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]); | |
| 434 *word_str += ch; | |
| 435 if (word_lengths_str != nullptr) { | |
| 436 *word_lengths_str += (char)strlen(ch); | |
| 437 } | |
| 438 } | |
| 439 } | |
| 440 | |
| 441 /** | |
| 442 * append_unichar_id | |
| 443 * | |
| 444 * Make sure there is enough space in the word for the new unichar id | |
| 445 * and call append_unichar_id_space_allocated(). | |
| 446 */ | |
| 447 void WERD_CHOICE::append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, | |
| 448 float certainty) { | |
| 449 if (length_ == reserved_) { | |
| 450 this->double_the_size(); | |
| 451 } | |
| 452 this->append_unichar_id_space_allocated(unichar_id, blob_count, rating, certainty); | |
| 453 } | |
| 454 | |
| 455 /** | |
| 456 * WERD_CHOICE::operator+= | |
| 457 * | |
| 458 * Cat a second word rating on the end of this current one. | |
| 459 * The ratings are added and the confidence is the min. | |
| 460 * If the permuters are NOT the same the permuter is set to COMPOUND_PERM | |
| 461 */ | |
| 462 WERD_CHOICE &WERD_CHOICE::operator+=(const WERD_CHOICE &second) { | |
| 463 ASSERT_HOST(unicharset_ == second.unicharset_); | |
| 464 while (reserved_ < length_ + second.length()) { | |
| 465 this->double_the_size(); | |
| 466 } | |
| 467 const std::vector<UNICHAR_ID> &other_unichar_ids = second.unichar_ids(); | |
| 468 for (unsigned i = 0; i < second.length(); ++i) { | |
| 469 unichar_ids_[length_ + i] = other_unichar_ids[i]; | |
| 470 state_[length_ + i] = second.state_[i]; | |
| 471 certainties_[length_ + i] = second.certainties_[i]; | |
| 472 script_pos_[length_ + i] = second.BlobPosition(i); | |
| 473 } | |
| 474 length_ += second.length(); | |
| 475 if (second.adjust_factor_ > adjust_factor_) { | |
| 476 adjust_factor_ = second.adjust_factor_; | |
| 477 } | |
| 478 rating_ += second.rating(); // add ratings | |
| 479 if (second.certainty() < certainty_) { // take min | |
| 480 certainty_ = second.certainty(); | |
| 481 } | |
| 482 if (second.dangerous_ambig_found_) { | |
| 483 dangerous_ambig_found_ = true; | |
| 484 } | |
| 485 if (permuter_ == NO_PERM) { | |
| 486 permuter_ = second.permuter(); | |
| 487 } else if (second.permuter() != NO_PERM && second.permuter() != permuter_) { | |
| 488 permuter_ = COMPOUND_PERM; | |
| 489 } | |
| 490 return *this; | |
| 491 } | |
| 492 | |
| 493 /** | |
| 494 * WERD_CHOICE::operator= | |
| 495 * | |
| 496 * Allocate enough memory to hold a copy of source and copy over | |
| 497 * all the information from source to this WERD_CHOICE. | |
| 498 */ | |
| 499 WERD_CHOICE &WERD_CHOICE::operator=(const WERD_CHOICE &source) { | |
| 500 while (reserved_ < source.length()) { | |
| 501 this->double_the_size(); | |
| 502 } | |
| 503 | |
| 504 unicharset_ = source.unicharset_; | |
| 505 const std::vector<UNICHAR_ID> &other_unichar_ids = source.unichar_ids(); | |
| 506 for (unsigned i = 0; i < source.length(); ++i) { | |
| 507 unichar_ids_[i] = other_unichar_ids[i]; | |
| 508 state_[i] = source.state_[i]; | |
| 509 certainties_[i] = source.certainties_[i]; | |
| 510 script_pos_[i] = source.BlobPosition(i); | |
| 511 } | |
| 512 length_ = source.length(); | |
| 513 adjust_factor_ = source.adjust_factor_; | |
| 514 rating_ = source.rating(); | |
| 515 certainty_ = source.certainty(); | |
| 516 min_x_height_ = source.min_x_height(); | |
| 517 max_x_height_ = source.max_x_height(); | |
| 518 permuter_ = source.permuter(); | |
| 519 dangerous_ambig_found_ = source.dangerous_ambig_found_; | |
| 520 return *this; | |
| 521 } | |
| 522 | |
| 523 // Sets up the script_pos_ member using the blobs_list to get the bln | |
| 524 // bounding boxes, *this to get the unichars, and this->unicharset | |
| 525 // to get the target positions. If small_caps is true, sub/super are not | |
| 526 // considered, but dropcaps are. | |
| 527 // NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.) | |
| 528 void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) { | |
| 529 // Initialize to normal. | |
| 530 for (unsigned i = 0; i < length_; ++i) { | |
| 531 script_pos_[i] = tesseract::SP_NORMAL; | |
| 532 } | |
| 533 if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) { | |
| 534 return; | |
| 535 } | |
| 536 | |
| 537 unsigned position_counts[4] = {0, 0, 0, 0}; | |
| 538 | |
| 539 int chunk_index = 0; | |
| 540 for (unsigned blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) { | |
| 541 TBLOB *tblob = word->blobs[chunk_index]; | |
| 542 int uni_id = unichar_id(blob_index); | |
| 543 TBOX blob_box = tblob->bounding_box(); | |
| 544 if (!state_.empty()) { | |
| 545 for (int i = 1; i < state_[blob_index]; ++i) { | |
| 546 ++chunk_index; | |
| 547 tblob = word->blobs[chunk_index]; | |
| 548 blob_box += tblob->bounding_box(); | |
| 549 } | |
| 550 } | |
| 551 script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, uni_id); | |
| 552 if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) { | |
| 553 script_pos_[blob_index] = tesseract::SP_NORMAL; | |
| 554 } | |
| 555 position_counts[script_pos_[blob_index]]++; | |
| 556 } | |
| 557 // If almost everything looks like a superscript or subscript, | |
| 558 // we most likely just got the baseline wrong. | |
| 559 if (4 * position_counts[tesseract::SP_SUBSCRIPT] > 3 * length_ || | |
| 560 4 * position_counts[tesseract::SP_SUPERSCRIPT] > 3 * length_) { | |
| 561 if (debug >= 2) { | |
| 562 tprintf( | |
| 563 "Most characters of %s are subscript or superscript.\n" | |
| 564 "That seems wrong, so I'll assume we got the baseline wrong\n", | |
| 565 unichar_string().c_str()); | |
| 566 } | |
| 567 for (unsigned i = 0; i < length_; i++) { | |
| 568 ScriptPos sp = script_pos_[i]; | |
| 569 if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) { | |
| 570 ASSERT_HOST(position_counts[sp] > 0); | |
| 571 position_counts[sp]--; | |
| 572 position_counts[tesseract::SP_NORMAL]++; | |
| 573 script_pos_[i] = tesseract::SP_NORMAL; | |
| 574 } | |
| 575 } | |
| 576 } | |
| 577 | |
| 578 if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || debug >= 2) { | |
| 579 tprintf("SetScriptPosition on %s\n", unichar_string().c_str()); | |
| 580 int chunk_index = 0; | |
| 581 for (unsigned blob_index = 0; blob_index < length_; ++blob_index) { | |
| 582 if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) { | |
| 583 TBLOB *tblob = word->blobs[chunk_index]; | |
| 584 ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index)); | |
| 585 } | |
| 586 chunk_index += state_.empty() ? 1 : state_[blob_index]; | |
| 587 } | |
| 588 } | |
| 589 } | |
| 590 | |
| 591 // Sets all the script_pos_ positions to the given position. | |
| 592 void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) { | |
| 593 for (unsigned i = 0; i < length_; ++i) { | |
| 594 script_pos_[i] = position; | |
| 595 } | |
| 596 } | |
| 597 | |
| 598 /* static */ | |
| 599 ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, | |
| 600 const TBOX &blob_box, UNICHAR_ID unichar_id) { | |
| 601 ScriptPos retval = tesseract::SP_NORMAL; | |
| 602 int top = blob_box.top(); | |
| 603 int bottom = blob_box.bottom(); | |
| 604 int min_bottom, max_bottom, min_top, max_top; | |
| 605 unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top); | |
| 606 | |
| 607 int sub_thresh_top = min_top - kMinSubscriptOffset; | |
| 608 int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset; | |
| 609 int sup_thresh_bot = max_bottom + kMinSuperscriptOffset; | |
| 610 if (bottom <= kMaxDropCapBottom) { | |
| 611 retval = tesseract::SP_DROPCAP; | |
| 612 } else if (top < sub_thresh_top && bottom < sub_thresh_bot) { | |
| 613 retval = tesseract::SP_SUBSCRIPT; | |
| 614 } else if (bottom > sup_thresh_bot) { | |
| 615 retval = tesseract::SP_SUPERSCRIPT; | |
| 616 } | |
| 617 | |
| 618 if (print_debug) { | |
| 619 const char *pos = ScriptPosToString(retval); | |
| 620 tprintf( | |
| 621 "%s Character %s[bot:%d top: %d] " | |
| 622 "bot_range[%d,%d] top_range[%d, %d] " | |
| 623 "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n", | |
| 624 pos, unicharset.id_to_unichar(unichar_id), bottom, top, min_bottom, max_bottom, min_top, | |
| 625 max_top, sub_thresh_bot, sub_thresh_top, sup_thresh_bot); | |
| 626 } | |
| 627 return retval; | |
| 628 } | |
| 629 | |
| 630 // Returns the script-id (eg Han) of the dominant script in the word. | |
| 631 int WERD_CHOICE::GetTopScriptID() const { | |
| 632 unsigned max_script = unicharset_->get_script_table_size(); | |
| 633 std::vector<unsigned> sid(max_script); | |
| 634 for (unsigned x = 0; x < length_; ++x) { | |
| 635 int script_id = unicharset_->get_script(unichar_id(x)); | |
| 636 sid[script_id]++; | |
| 637 } | |
| 638 if (unicharset_->han_sid() != unicharset_->null_sid()) { | |
| 639 // Add the Hiragana & Katakana counts to Han and zero them out. | |
| 640 if (unicharset_->hiragana_sid() != unicharset_->null_sid()) { | |
| 641 sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()]; | |
| 642 sid[unicharset_->hiragana_sid()] = 0; | |
| 643 } | |
| 644 if (unicharset_->katakana_sid() != unicharset_->null_sid()) { | |
| 645 sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()]; | |
| 646 sid[unicharset_->katakana_sid()] = 0; | |
| 647 } | |
| 648 } | |
| 649 // Note that high script ID overrides lower one on a tie, thus biasing | |
| 650 // towards non-Common script (if sorted that way in unicharset file). | |
| 651 unsigned max_sid = 0; | |
| 652 for (unsigned x = 1; x < max_script; x++) { | |
| 653 if (sid[x] >= sid[max_sid]) { | |
| 654 max_sid = x; | |
| 655 } | |
| 656 } | |
| 657 if (sid[max_sid] < length_ / 2) { | |
| 658 max_sid = unicharset_->null_sid(); | |
| 659 } | |
| 660 return max_sid; | |
| 661 } | |
| 662 | |
| 663 // Fixes the state_ for a chop at the given blob_posiiton. | |
| 664 void WERD_CHOICE::UpdateStateForSplit(int blob_position) { | |
| 665 int total_chunks = 0; | |
| 666 for (unsigned i = 0; i < length_; ++i) { | |
| 667 total_chunks += state_[i]; | |
| 668 if (total_chunks > blob_position) { | |
| 669 ++state_[i]; | |
| 670 return; | |
| 671 } | |
| 672 } | |
| 673 } | |
| 674 | |
| 675 // Returns the sum of all the state elements, being the total number of blobs. | |
| 676 unsigned WERD_CHOICE::TotalOfStates() const { | |
| 677 unsigned total_chunks = 0; | |
| 678 for (unsigned i = 0; i < length_; ++i) { | |
| 679 total_chunks += state_[i]; | |
| 680 } | |
| 681 return total_chunks; | |
| 682 } | |
| 683 | |
| 684 /** | |
| 685 * WERD_CHOICE::print | |
| 686 * | |
| 687 * Print WERD_CHOICE to stdout. | |
| 688 */ | |
| 689 void WERD_CHOICE::print(const char *msg) const { | |
| 690 tprintf("%s : ", msg); | |
| 691 for (unsigned i = 0; i < length_; ++i) { | |
| 692 tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i])); | |
| 693 } | |
| 694 tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", rating_, certainty_, | |
| 695 adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_); | |
| 696 tprintf("pos"); | |
| 697 for (unsigned i = 0; i < length_; ++i) { | |
| 698 tprintf("\t%s", ScriptPosToString(script_pos_[i])); | |
| 699 } | |
| 700 tprintf("\nstr"); | |
| 701 for (unsigned i = 0; i < length_; ++i) { | |
| 702 tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i])); | |
| 703 } | |
| 704 tprintf("\nstate:"); | |
| 705 for (unsigned i = 0; i < length_; ++i) { | |
| 706 tprintf("\t%d ", state_[i]); | |
| 707 } | |
| 708 tprintf("\nC"); | |
| 709 for (unsigned i = 0; i < length_; ++i) { | |
| 710 tprintf("\t%.3f", certainties_[i]); | |
| 711 } | |
| 712 tprintf("\n"); | |
| 713 } | |
| 714 | |
| 715 // Prints the segmentation state with an introductory message. | |
| 716 void WERD_CHOICE::print_state(const char *msg) const { | |
| 717 tprintf("%s", msg); | |
| 718 for (unsigned i = 0; i < length_; ++i) { | |
| 719 tprintf(" %d", state_[i]); | |
| 720 } | |
| 721 tprintf("\n"); | |
| 722 } | |
| 723 | |
| 724 #ifndef GRAPHICS_DISABLED | |
| 725 | |
| 726 // Displays the segmentation state of *this (if not the same as the last | |
| 727 // one displayed) and waits for a click in the window. | |
| 728 void WERD_CHOICE::DisplaySegmentation(TWERD *word) { | |
| 729 // Number of different colors to draw with. | |
| 730 const int kNumColors = 6; | |
| 731 static ScrollView *segm_window = nullptr; | |
| 732 // Check the state against the static prev_drawn_state. | |
| 733 static std::vector<int> prev_drawn_state; | |
| 734 bool already_done = prev_drawn_state.size() == length_; | |
| 735 if (!already_done) { | |
| 736 prev_drawn_state.clear(); | |
| 737 prev_drawn_state.resize(length_); | |
| 738 } | |
| 739 for (unsigned i = 0; i < length_; ++i) { | |
| 740 if (prev_drawn_state[i] != state_[i]) { | |
| 741 already_done = false; | |
| 742 } | |
| 743 prev_drawn_state[i] = state_[i]; | |
| 744 } | |
| 745 if (already_done || word->blobs.empty()) { | |
| 746 return; | |
| 747 } | |
| 748 | |
| 749 // Create the window if needed. | |
| 750 if (segm_window == nullptr) { | |
| 751 segm_window = new ScrollView("Segmentation", 5, 10, 500, 256, 2000.0, 256.0, true); | |
| 752 } else { | |
| 753 segm_window->Clear(); | |
| 754 } | |
| 755 | |
| 756 TBOX bbox; | |
| 757 int blob_index = 0; | |
| 758 for (unsigned c = 0; c < length_; ++c) { | |
| 759 auto color = static_cast<ScrollView::Color>(c % kNumColors + 3); | |
| 760 for (int i = 0; i < state_[c]; ++i, ++blob_index) { | |
| 761 TBLOB *blob = word->blobs[blob_index]; | |
| 762 bbox += blob->bounding_box(); | |
| 763 blob->plot(segm_window, color, color); | |
| 764 } | |
| 765 } | |
| 766 segm_window->ZoomToRectangle(bbox.left(), bbox.top(), bbox.right(), bbox.bottom()); | |
| 767 segm_window->Update(); | |
| 768 segm_window->Wait(); | |
| 769 } | |
| 770 | |
| 771 #endif // !GRAPHICS_DISABLED | |
| 772 | |
| 773 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2) { | |
| 774 const UNICHARSET *uchset = word1.unicharset(); | |
| 775 if (word2.unicharset() != uchset) { | |
| 776 return false; | |
| 777 } | |
| 778 unsigned w1start, w1end; | |
| 779 word1.punct_stripped(&w1start, &w1end); | |
| 780 unsigned w2start, w2end; | |
| 781 word2.punct_stripped(&w2start, &w2end); | |
| 782 if (w1end - w1start != w2end - w2start) { | |
| 783 return false; | |
| 784 } | |
| 785 for (unsigned i = 0; i < w1end - w1start; i++) { | |
| 786 if (uchset->to_lower(word1.unichar_id(w1start + i)) != | |
| 787 uchset->to_lower(word2.unichar_id(w2start + i))) { | |
| 788 return false; | |
| 789 } | |
| 790 } | |
| 791 return true; | |
| 792 } | |
| 793 | |
| 794 /** | |
| 795 * print_ratings_list | |
| 796 * | |
| 797 * Send all the ratings out to the logfile. | |
| 798 * | |
| 799 * @param msg intro message | |
| 800 * @param ratings list of ratings | |
| 801 * @param current_unicharset unicharset that can be used | |
| 802 * for id-to-unichar conversion | |
| 803 */ | |
| 804 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, | |
| 805 const UNICHARSET ¤t_unicharset) { | |
| 806 if (ratings->empty()) { | |
| 807 tprintf("%s:<none>\n", msg); | |
| 808 return; | |
| 809 } | |
| 810 if (*msg != '\0') { | |
| 811 tprintf("%s\n", msg); | |
| 812 } | |
| 813 BLOB_CHOICE_IT c_it; | |
| 814 c_it.set_to_list(ratings); | |
| 815 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { | |
| 816 c_it.data()->print(¤t_unicharset); | |
| 817 if (!c_it.at_last()) { | |
| 818 tprintf("\n"); | |
| 819 } | |
| 820 } | |
| 821 tprintf("\n"); | |
| 822 fflush(stdout); | |
| 823 } | |
| 824 | |
| 825 } // namespace tesseract |
