Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccutil/unicharset.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: unicharset.cpp | |
| 3 // Description: Unicode character/ligature set class. | |
| 4 // Author: Thomas Kielbus | |
| 5 // | |
| 6 // (C) Copyright 2006, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #include "unicharset.h" | |
| 20 | |
| 21 #include "params.h" | |
| 22 | |
| 23 #include <tesseract/unichar.h> | |
| 24 #include "serialis.h" | |
| 25 | |
| 26 #include <algorithm> | |
| 27 #include <cassert> | |
| 28 #include <cstdio> | |
| 29 #include <cstring> | |
| 30 #include <iomanip> // for std::setw | |
| 31 #include <locale> // for std::locale::classic | |
| 32 #include <sstream> // for std::istringstream, std::ostringstream | |
| 33 | |
| 34 namespace tesseract { | |
| 35 | |
// Character that delimits the pieces of a character-fragment representation.
static constexpr char kSeparator = '|';
// Character that flags a 'natural' character fragment.
static constexpr char kNaturalFlag = 'n';

// Bit positions used by get_properties() to pack the boolean character
// classes of a unichar into one integer.
static constexpr int ISALPHA_MASK = 0x1;
static constexpr int ISLOWER_MASK = 0x2;
static constexpr int ISUPPER_MASK = 0x4;
static constexpr int ISDIGIT_MASK = 0x8;
static constexpr int ISPUNCTUATION_MASK = 0x10;

// Y coordinate threshold that separates cap-height from x-height.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static constexpr int kMeanlineThreshold = 220;
// With C = number of alpha chars whose tops all exceed kMeanlineThreshold and
// X = number of alpha chars whose tops are all below kMeanlineThreshold, the
// unicharset "has x-height" if X > C * kMinXHeightFraction and
// C > X * kMinCapHeightFraction, or if more than half of the alpha
// characters have upper or lower case.
const double kMinXHeightFraction = 0.25;
const double kMinCapHeightFraction = 0.05;
| 59 | |
| 60 /*static */ | |
| 61 const char *UNICHARSET::kCustomLigatures[][2] = { | |
| 62 {"ct", "\uE003"}, // c + t -> U+E003 | |
| 63 {"ſh", "\uE006"}, // long-s + h -> U+E006 | |
| 64 {"ſi", "\uE007"}, // long-s + i -> U+E007 | |
| 65 {"ſl", "\uE008"}, // long-s + l -> U+E008 | |
| 66 {"ſſ", "\uE009"}, // long-s + long-s -> U+E009 | |
| 67 {nullptr, nullptr}}; | |
| 68 | |
| 69 // List of mappings to make when ingesting strings from the outside. | |
| 70 // The substitutions clean up text that should exist for rendering of | |
| 71 // synthetic data, but not in the recognition set. | |
| 72 const char *UNICHARSET::kCleanupMaps[][2] = { | |
| 73 {"\u0640", ""}, // TATWEEL is deleted. | |
| 74 {"\ufb01", "fi"}, // fi ligature->fi pair. | |
| 75 {"\ufb02", "fl"}, // fl ligature->fl pair. | |
| 76 {nullptr, nullptr}}; | |
| 77 | |
| 78 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum. | |
| 79 const char *UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = { | |
| 80 " ", "Joined", "|Broken|0|1"}; | |
| 81 | |
| 82 const char *UNICHARSET::null_script = "NULL"; | |
| 83 | |
| 84 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() { | |
| 85 Init(); | |
| 86 } | |
| 87 | |
| 88 // Initialize all properties to sensible default values. | |
| 89 void UNICHARSET::UNICHAR_PROPERTIES::Init() { | |
| 90 isalpha = false; | |
| 91 islower = false; | |
| 92 isupper = false; | |
| 93 isdigit = false; | |
| 94 ispunctuation = false; | |
| 95 isngram = false; | |
| 96 enabled = false; | |
| 97 SetRangesOpen(); | |
| 98 script_id = 0; | |
| 99 other_case = 0; | |
| 100 mirror = 0; | |
| 101 normed = ""; | |
| 102 direction = UNICHARSET::U_LEFT_TO_RIGHT; | |
| 103 fragment = nullptr; | |
| 104 } | |
| 105 | |
| 106 // Sets all ranges wide open. Initialization default in case there are | |
| 107 // no useful values available. | |
| 108 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() { | |
| 109 min_bottom = 0; | |
| 110 max_bottom = UINT8_MAX; | |
| 111 min_top = 0; | |
| 112 max_top = UINT8_MAX; | |
| 113 width = 0.0f; | |
| 114 width_sd = 0.0f; | |
| 115 bearing = 0.0f; | |
| 116 bearing_sd = 0.0f; | |
| 117 advance = 0.0f; | |
| 118 advance_sd = 0.0f; | |
| 119 } | |
| 120 | |
| 121 // Sets all ranges to empty. Used before expanding with font-based data. | |
| 122 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() { | |
| 123 min_bottom = UINT8_MAX; | |
| 124 max_bottom = 0; | |
| 125 min_top = UINT8_MAX; | |
| 126 max_top = 0; | |
| 127 width = 0.0f; | |
| 128 width_sd = 0.0f; | |
| 129 bearing = 0.0f; | |
| 130 bearing_sd = 0.0f; | |
| 131 advance = 0.0f; | |
| 132 advance_sd = 0.0f; | |
| 133 } | |
| 134 | |
| 135 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats | |
| 136 // is empty. | |
| 137 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const { | |
| 138 return width == 0.0f || advance == 0.0f; | |
| 139 } | |
| 140 | |
| 141 // Expands the ranges with the ranges from the src properties. | |
| 142 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom( | |
| 143 const UNICHAR_PROPERTIES &src) { | |
| 144 UpdateRange(src.min_bottom, &min_bottom, &max_bottom); | |
| 145 UpdateRange(src.max_bottom, &min_bottom, &max_bottom); | |
| 146 UpdateRange(src.min_top, &min_top, &max_top); | |
| 147 UpdateRange(src.max_top, &min_top, &max_top); | |
| 148 if (src.width_sd > width_sd) { | |
| 149 width = src.width; | |
| 150 width_sd = src.width_sd; | |
| 151 } | |
| 152 if (src.bearing_sd > bearing_sd) { | |
| 153 bearing = src.bearing; | |
| 154 bearing_sd = src.bearing_sd; | |
| 155 } | |
| 156 if (src.advance_sd > advance_sd) { | |
| 157 advance = src.advance; | |
| 158 advance_sd = src.advance_sd; | |
| 159 } | |
| 160 } | |
| 161 | |
| 162 // Copies the properties from src into this. | |
| 163 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES &src) { | |
| 164 // Apart from the fragment, everything else can be done with a default copy. | |
| 165 CHAR_FRAGMENT *saved_fragment = fragment; | |
| 166 *this = src; // Bitwise copy. | |
| 167 fragment = saved_fragment; | |
| 168 } | |
| 169 | |
| 170 UNICHARSET::UNICHARSET() | |
| 171 : ids(), script_table(nullptr), script_table_size_used(0) { | |
| 172 clear(); | |
| 173 for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) { | |
| 174 unichar_insert(kSpecialUnicharCodes[i]); | |
| 175 if (i == UNICHAR_JOINED) { | |
| 176 set_isngram(i, true); | |
| 177 } | |
| 178 } | |
| 179 } | |
| 180 | |
| 181 UNICHARSET::~UNICHARSET() { | |
| 182 clear(); | |
| 183 } | |
| 184 | |
| 185 UNICHAR_ID | |
| 186 UNICHARSET::unichar_to_id(const char *const unichar_repr) const { | |
| 187 std::string cleaned = | |
| 188 old_style_included_ ? unichar_repr : CleanupString(unichar_repr); | |
| 189 return ids.contains(cleaned.data(), cleaned.size()) | |
| 190 ? ids.unichar_to_id(cleaned.data(), cleaned.size()) | |
| 191 : INVALID_UNICHAR_ID; | |
| 192 } | |
| 193 | |
| 194 UNICHAR_ID UNICHARSET::unichar_to_id(const char *const unichar_repr, | |
| 195 int length) const { | |
| 196 assert(length > 0 && length <= UNICHAR_LEN); | |
| 197 std::string cleaned(unichar_repr, length); | |
| 198 if (!old_style_included_) { | |
| 199 cleaned = CleanupString(unichar_repr, length); | |
| 200 } | |
| 201 return ids.contains(cleaned.data(), cleaned.size()) | |
| 202 ? ids.unichar_to_id(cleaned.data(), cleaned.size()) | |
| 203 : INVALID_UNICHAR_ID; | |
| 204 } | |
| 205 | |
| 206 // Return the minimum number of bytes that matches a legal UNICHAR_ID, | |
| 207 // while leaving the rest of the string encodable. Returns 0 if the | |
| 208 // beginning of the string is not encodable. | |
| 209 // WARNING: this function now encodes the whole string for precision. | |
| 210 // Use encode_string in preference to repeatedly calling step. | |
| 211 int UNICHARSET::step(const char *str) const { | |
| 212 std::vector<UNICHAR_ID> encoding; | |
| 213 std::vector<char> lengths; | |
| 214 encode_string(str, true, &encoding, &lengths, nullptr); | |
| 215 if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) { | |
| 216 return 0; | |
| 217 } | |
| 218 return lengths[0]; | |
| 219 } | |
| 220 | |
| 221 // Return whether the given UTF-8 string is encodable with this UNICHARSET. | |
| 222 // If not encodable, write the first byte offset which cannot be converted | |
| 223 // into the second (return) argument. | |
| 224 bool UNICHARSET::encodable_string(const char *str, | |
| 225 unsigned *first_bad_position) const { | |
| 226 std::vector<UNICHAR_ID> encoding; | |
| 227 return encode_string(str, true, &encoding, nullptr, first_bad_position); | |
| 228 } | |
| 229 | |
| 230 // Encodes the given UTF-8 string with this UNICHARSET. | |
| 231 // Returns true if the encoding succeeds completely, false if there is at | |
| 232 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case | |
| 233 // the rest of the string is still encoded. | |
| 234 // If lengths is not nullptr, then it is filled with the corresponding | |
| 235 // byte length of each encoded UNICHAR_ID. | |
| 236 // WARNING: Caller must guarantee that str has already been cleaned of codes | |
| 237 // that do not belong in the unicharset, or encoding may fail. | |
| 238 // Use CleanupString to perform the cleaning. | |
| 239 bool UNICHARSET::encode_string(const char *str, bool give_up_on_failure, | |
| 240 std::vector<UNICHAR_ID> *encoding, | |
| 241 std::vector<char> *lengths, | |
| 242 unsigned *encoded_length) const { | |
| 243 std::vector<UNICHAR_ID> working_encoding; | |
| 244 std::vector<char> working_lengths; | |
| 245 std::vector<char> best_lengths; | |
| 246 encoding->clear(); // Just in case str is empty. | |
| 247 auto str_length = strlen(str); | |
| 248 unsigned str_pos = 0; | |
| 249 bool perfect = true; | |
| 250 while (str_pos < str_length) { | |
| 251 encode_string(str, str_pos, str_length, &working_encoding, &working_lengths, | |
| 252 &str_pos, encoding, &best_lengths); | |
| 253 if (str_pos < str_length) { | |
| 254 // This is a non-match. Skip one utf-8 character. | |
| 255 perfect = false; | |
| 256 if (give_up_on_failure) { | |
| 257 break; | |
| 258 } | |
| 259 int step = UNICHAR::utf8_step(str + str_pos); | |
| 260 if (step == 0) { | |
| 261 step = 1; | |
| 262 } | |
| 263 encoding->push_back(INVALID_UNICHAR_ID); | |
| 264 best_lengths.push_back(step); | |
| 265 str_pos += step; | |
| 266 working_encoding = *encoding; | |
| 267 working_lengths = best_lengths; | |
| 268 } | |
| 269 } | |
| 270 if (lengths != nullptr) { | |
| 271 *lengths = std::move(best_lengths); | |
| 272 } | |
| 273 if (encoded_length != nullptr) { | |
| 274 *encoded_length = str_pos; | |
| 275 } | |
| 276 return perfect; | |
| 277 } | |
| 278 | |
| 279 const char *UNICHARSET::id_to_unichar(UNICHAR_ID id) const { | |
| 280 if (id == INVALID_UNICHAR_ID) { | |
| 281 return INVALID_UNICHAR; | |
| 282 } | |
| 283 ASSERT_HOST(static_cast<unsigned>(id) < this->size()); | |
| 284 return unichars[id].representation; | |
| 285 } | |
| 286 | |
| 287 const char *UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const { | |
| 288 if (id == INVALID_UNICHAR_ID) { | |
| 289 return INVALID_UNICHAR; | |
| 290 } | |
| 291 ASSERT_HOST(static_cast<unsigned>(id) < this->size()); | |
| 292 // Resolve from the kCustomLigatures table if this is a private encoding. | |
| 293 if (get_isprivate(id)) { | |
| 294 const char *ch = id_to_unichar(id); | |
| 295 for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) { | |
| 296 if (!strcmp(ch, kCustomLigatures[i][1])) { | |
| 297 return kCustomLigatures[i][0]; | |
| 298 } | |
| 299 } | |
| 300 } | |
| 301 // Otherwise return the stored representation. | |
| 302 return unichars[id].representation; | |
| 303 } | |
| 304 | |
| 305 // Return a string that reformats the utf8 str into the str followed | |
| 306 // by its hex unicodes. | |
| 307 std::string UNICHARSET::debug_utf8_str(const char *str) { | |
| 308 std::string result = str; | |
| 309 result += " ["; | |
| 310 int step = 1; | |
| 311 // Chop into unicodes and code each as hex. | |
| 312 for (int i = 0; str[i] != '\0'; i += step) { | |
| 313 char hex[sizeof(int) * 2 + 1]; | |
| 314 step = UNICHAR::utf8_step(str + i); | |
| 315 if (step == 0) { | |
| 316 step = 1; | |
| 317 snprintf(hex, sizeof(hex), "%x", str[i]); | |
| 318 } else { | |
| 319 UNICHAR ch(str + i, step); | |
| 320 snprintf(hex, sizeof(hex), "%x", ch.first_uni()); | |
| 321 } | |
| 322 result += hex; | |
| 323 result += " "; | |
| 324 } | |
| 325 result += "]"; | |
| 326 return result; | |
| 327 } | |
| 328 | |
| 329 // Return a string containing debug information on the unichar, including | |
| 330 // the id_to_unichar, its hex unicodes and the properties. | |
| 331 std::string UNICHARSET::debug_str(UNICHAR_ID id) const { | |
| 332 if (id == INVALID_UNICHAR_ID) { | |
| 333 return std::string(id_to_unichar(id)); | |
| 334 } | |
| 335 const CHAR_FRAGMENT *fragment = this->get_fragment(id); | |
| 336 if (fragment) { | |
| 337 return fragment->to_string(); | |
| 338 } | |
| 339 const char *str = id_to_unichar(id); | |
| 340 std::string result = debug_utf8_str(str); | |
| 341 // Append a for lower alpha, A for upper alpha, and x if alpha but neither. | |
| 342 if (get_isalpha(id)) { | |
| 343 if (get_islower(id)) { | |
| 344 result += "a"; | |
| 345 } else if (get_isupper(id)) { | |
| 346 result += "A"; | |
| 347 } else { | |
| 348 result += "x"; | |
| 349 } | |
| 350 } | |
| 351 // Append 0 if a digit. | |
| 352 if (get_isdigit(id)) { | |
| 353 result += "0"; | |
| 354 } | |
| 355 // Append p is a punctuation symbol. | |
| 356 if (get_ispunctuation(id)) { | |
| 357 result += "p"; | |
| 358 } | |
| 359 return result; | |
| 360 } | |
| 361 | |
| 362 // Sets the normed_ids vector from the normed string. normed_ids is not | |
| 363 // stored in the file, and needs to be set when the UNICHARSET is loaded. | |
| 364 void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) { | |
| 365 unichars[unichar_id].properties.normed_ids.clear(); | |
| 366 if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') { | |
| 367 unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE); | |
| 368 } else if (!encode_string(unichars[unichar_id].properties.normed.c_str(), | |
| 369 true, &unichars[unichar_id].properties.normed_ids, | |
| 370 nullptr, nullptr)) { | |
| 371 unichars[unichar_id].properties.normed_ids.clear(); | |
| 372 unichars[unichar_id].properties.normed_ids.push_back(unichar_id); | |
| 373 } | |
| 374 } | |
| 375 | |
| 376 // Returns whether the unichar id represents a unicode value in the private use | |
| 377 // area. We use this range only internally to represent uncommon ligatures | |
| 378 // (eg. 'ct') that do not have regular unicode values. | |
| 379 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const { | |
| 380 UNICHAR uc(id_to_unichar(unichar_id), -1); | |
| 381 int uni = uc.first_uni(); | |
| 382 return (uni >= 0xE000 && uni <= 0xF8FF); | |
| 383 } | |
| 384 | |
| 385 // Sets all ranges to empty, so they can be expanded to set the values. | |
| 386 void UNICHARSET::set_ranges_empty() { | |
| 387 for (auto &uc : unichars) { | |
| 388 uc.properties.SetRangesEmpty(); | |
| 389 } | |
| 390 } | |
| 391 | |
| 392 // Sets all the properties for this unicharset given a src unicharset with | |
| 393 // everything set. The unicharsets don't have to be the same, and graphemes | |
| 394 // are correctly accounted for. | |
| 395 void UNICHARSET::PartialSetPropertiesFromOther(int start_index, | |
| 396 const UNICHARSET &src) { | |
| 397 for (unsigned ch = start_index; ch < unichars.size(); ++ch) { | |
| 398 const char *utf8 = id_to_unichar(ch); | |
| 399 UNICHAR_PROPERTIES properties; | |
| 400 if (src.GetStrProperties(utf8, &properties)) { | |
| 401 // Setup the script_id, other_case, and mirror properly. | |
| 402 const char *script = src.get_script_from_script_id(properties.script_id); | |
| 403 properties.script_id = add_script(script); | |
| 404 const char *other_case = src.id_to_unichar(properties.other_case); | |
| 405 if (contains_unichar(other_case)) { | |
| 406 properties.other_case = unichar_to_id(other_case); | |
| 407 } else { | |
| 408 properties.other_case = ch; | |
| 409 } | |
| 410 const char *mirror_str = src.id_to_unichar(properties.mirror); | |
| 411 if (contains_unichar(mirror_str)) { | |
| 412 properties.mirror = unichar_to_id(mirror_str); | |
| 413 } else { | |
| 414 properties.mirror = ch; | |
| 415 } | |
| 416 unichars[ch].properties.CopyFrom(properties); | |
| 417 set_normed_ids(ch); | |
| 418 } | |
| 419 } | |
| 420 } | |
| 421 | |
| 422 // Expands the tops and bottoms and widths for this unicharset given a | |
| 423 // src unicharset with ranges in it. The unicharsets don't have to be the | |
| 424 // same, and graphemes are correctly accounted for. | |
| 425 void UNICHARSET::ExpandRangesFromOther(const UNICHARSET &src) { | |
| 426 for (unsigned ch = 0; ch < unichars.size(); ++ch) { | |
| 427 const char *utf8 = id_to_unichar(ch); | |
| 428 UNICHAR_PROPERTIES properties; | |
| 429 if (src.GetStrProperties(utf8, &properties)) { | |
| 430 // Expand just the ranges from properties. | |
| 431 unichars[ch].properties.ExpandRangesFrom(properties); | |
| 432 } | |
| 433 } | |
| 434 } | |
| 435 | |
| 436 // Makes this a copy of src. Clears this completely first, so the automatic | |
| 437 // ids will not be present in this if not in src. Does NOT reorder the set! | |
| 438 void UNICHARSET::CopyFrom(const UNICHARSET &src) { | |
| 439 clear(); | |
| 440 for (unsigned ch = 0; ch < src.unichars.size(); ++ch) { | |
| 441 const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties; | |
| 442 const char *utf8 = src.id_to_unichar(ch); | |
| 443 unichar_insert_backwards_compatible(utf8); | |
| 444 unichars[ch].properties.ExpandRangesFrom(src_props); | |
| 445 } | |
| 446 // Set properties, including mirror and other_case, WITHOUT reordering | |
| 447 // the unicharset. | |
| 448 PartialSetPropertiesFromOther(0, src); | |
| 449 } | |
| 450 | |
| 451 // For each id in src, if it does not occur in this, add it, as in | |
| 452 // SetPropertiesFromOther, otherwise expand the ranges, as in | |
| 453 // ExpandRangesFromOther. | |
| 454 void UNICHARSET::AppendOtherUnicharset(const UNICHARSET &src) { | |
| 455 int initial_used = unichars.size(); | |
| 456 for (unsigned ch = 0; ch < src.unichars.size(); ++ch) { | |
| 457 const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties; | |
| 458 const char *utf8 = src.id_to_unichar(ch); | |
| 459 int id = unichars.size(); | |
| 460 if (contains_unichar(utf8)) { | |
| 461 id = unichar_to_id(utf8); | |
| 462 // Just expand current ranges. | |
| 463 unichars[id].properties.ExpandRangesFrom(src_props); | |
| 464 } else { | |
| 465 unichar_insert_backwards_compatible(utf8); | |
| 466 unichars[id].properties.SetRangesEmpty(); | |
| 467 } | |
| 468 } | |
| 469 // Set properties, including mirror and other_case, WITHOUT reordering | |
| 470 // the unicharset. | |
| 471 PartialSetPropertiesFromOther(initial_used, src); | |
| 472 } | |
| 473 | |
| 474 // Returns true if the acceptable ranges of the tops of the characters do | |
| 475 // not overlap, making their x-height calculations distinct. | |
| 476 bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const { | |
| 477 int overlap = std::min(unichars[id1].properties.max_top, | |
| 478 unichars[id2].properties.max_top) - | |
| 479 std::max(unichars[id1].properties.min_top, | |
| 480 unichars[id2].properties.min_top); | |
| 481 return overlap <= 0; | |
| 482 } | |
| 483 | |
| 484 // Internal recursive version of encode_string above. | |
| 485 // Seeks to encode the given string as a sequence of UNICHAR_IDs such that | |
| 486 // each UNICHAR_ID uses the least possible part of the utf8 str. | |
| 487 // It does this by depth-first tail recursion on increasing length matches | |
| 488 // to the UNICHARSET, saving the first encountered result that encodes the | |
| 489 // maximum total length of str. It stops on a failure to encode to make | |
| 490 // the overall process of encoding a partially failed string more efficient. | |
| 491 // See unicharset.h for definition of the args. | |
| 492 void UNICHARSET::encode_string(const char *str, int str_index, int str_length, | |
| 493 std::vector<UNICHAR_ID> *encoding, | |
| 494 std::vector<char> *lengths, | |
| 495 unsigned *best_total_length, | |
| 496 std::vector<UNICHAR_ID> *best_encoding, | |
| 497 std::vector<char> *best_lengths) const { | |
| 498 if (str_index > static_cast<int>(*best_total_length)) { | |
| 499 // This is the best result so far. | |
| 500 *best_total_length = str_index; | |
| 501 *best_encoding = *encoding; | |
| 502 if (best_lengths != nullptr) { | |
| 503 *best_lengths = *lengths; | |
| 504 } | |
| 505 } | |
| 506 if (str_index == str_length) { | |
| 507 return; | |
| 508 } | |
| 509 int encoding_index = encoding->size(); | |
| 510 // Find the length of the first matching unicharset member. | |
| 511 int length = ids.minmatch(str + str_index); | |
| 512 if (length == 0 || str_index + length > str_length) { | |
| 513 return; | |
| 514 } | |
| 515 do { | |
| 516 if (ids.contains(str + str_index, length)) { | |
| 517 // Successful encoding so far. | |
| 518 UNICHAR_ID id = ids.unichar_to_id(str + str_index, length); | |
| 519 encoding->push_back(id); | |
| 520 lengths->push_back(length); | |
| 521 encode_string(str, str_index + length, str_length, encoding, lengths, | |
| 522 best_total_length, best_encoding, best_lengths); | |
| 523 if (static_cast<int>(*best_total_length) == str_length) { | |
| 524 return; // Tail recursion success! | |
| 525 } | |
| 526 // Failed with that length, truncate back and try again. | |
| 527 encoding->resize(encoding_index); | |
| 528 lengths->resize(encoding_index); | |
| 529 } | |
| 530 int step = UNICHAR::utf8_step(str + str_index + length); | |
| 531 if (step == 0) { | |
| 532 step = 1; | |
| 533 } | |
| 534 length += step; | |
| 535 } while (length <= UNICHAR_LEN && str_index + length <= str_length); | |
| 536 } | |
| 537 | |
| 538 // Gets the properties for a grapheme string, combining properties for | |
| 539 // multiple characters in a meaningful way where possible. | |
| 540 // Returns false if no valid match was found in the unicharset. | |
| 541 // NOTE that script_id, mirror, and other_case refer to this unicharset on | |
| 542 // return and will need translation if the target unicharset is different. | |
| 543 bool UNICHARSET::GetStrProperties(const char *utf8_str, | |
| 544 UNICHAR_PROPERTIES *props) const { | |
| 545 props->Init(); | |
| 546 props->SetRangesEmpty(); | |
| 547 int total_unicodes = 0; | |
| 548 std::vector<UNICHAR_ID> encoding; | |
| 549 if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr)) { | |
| 550 return false; // Some part was invalid. | |
| 551 } | |
| 552 for (auto it : encoding) { | |
| 553 int id = it; | |
| 554 const UNICHAR_PROPERTIES &src_props = unichars[id].properties; | |
| 555 // Logical OR all the bools. | |
| 556 if (src_props.isalpha) { | |
| 557 props->isalpha = true; | |
| 558 } | |
| 559 if (src_props.islower) { | |
| 560 props->islower = true; | |
| 561 } | |
| 562 if (src_props.isupper) { | |
| 563 props->isupper = true; | |
| 564 } | |
| 565 if (src_props.isdigit) { | |
| 566 props->isdigit = true; | |
| 567 } | |
| 568 if (src_props.ispunctuation) { | |
| 569 props->ispunctuation = true; | |
| 570 } | |
| 571 if (src_props.isngram) { | |
| 572 props->isngram = true; | |
| 573 } | |
| 574 if (src_props.enabled) { | |
| 575 props->enabled = true; | |
| 576 } | |
| 577 // Min/max the tops/bottoms. | |
| 578 UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom); | |
| 579 UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom); | |
| 580 UpdateRange(src_props.min_top, &props->min_top, &props->max_top); | |
| 581 UpdateRange(src_props.max_top, &props->min_top, &props->max_top); | |
| 582 float bearing = props->advance + src_props.bearing; | |
| 583 if (total_unicodes == 0 || bearing < props->bearing) { | |
| 584 props->bearing = bearing; | |
| 585 props->bearing_sd = props->advance_sd + src_props.bearing_sd; | |
| 586 } | |
| 587 props->advance += src_props.advance; | |
| 588 props->advance_sd += src_props.advance_sd; | |
| 589 // With a single width, just use the widths stored in the unicharset. | |
| 590 props->width = src_props.width; | |
| 591 props->width_sd = src_props.width_sd; | |
| 592 // Use the first script id, other_case, mirror, direction. | |
| 593 // Note that these will need translation, except direction. | |
| 594 if (total_unicodes == 0) { | |
| 595 props->script_id = src_props.script_id; | |
| 596 props->other_case = src_props.other_case; | |
| 597 props->mirror = src_props.mirror; | |
| 598 props->direction = src_props.direction; | |
| 599 } | |
| 600 // The normed string for the compound character is the concatenation of | |
| 601 // the normed versions of the individual characters. | |
| 602 props->normed += src_props.normed; | |
| 603 ++total_unicodes; | |
| 604 } | |
| 605 if (total_unicodes > 1) { | |
| 606 // Estimate the total widths from the advance - bearing. | |
| 607 props->width = props->advance - props->bearing; | |
| 608 props->width_sd = props->advance_sd + props->bearing_sd; | |
| 609 } | |
| 610 return total_unicodes > 0; | |
| 611 } | |
| 612 | |
| 613 // TODO(rays) clean-up the order of functions to match unicharset.h. | |
| 614 | |
| 615 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const { | |
| 616 unsigned int properties = 0; | |
| 617 if (this->get_isalpha(id)) { | |
| 618 properties |= ISALPHA_MASK; | |
| 619 } | |
| 620 if (this->get_islower(id)) { | |
| 621 properties |= ISLOWER_MASK; | |
| 622 } | |
| 623 if (this->get_isupper(id)) { | |
| 624 properties |= ISUPPER_MASK; | |
| 625 } | |
| 626 if (this->get_isdigit(id)) { | |
| 627 properties |= ISDIGIT_MASK; | |
| 628 } | |
| 629 if (this->get_ispunctuation(id)) { | |
| 630 properties |= ISPUNCTUATION_MASK; | |
| 631 } | |
| 632 return properties; | |
| 633 } | |
| 634 | |
| 635 char UNICHARSET::get_chartype(UNICHAR_ID id) const { | |
| 636 if (this->get_isupper(id)) { | |
| 637 return 'A'; | |
| 638 } | |
| 639 if (this->get_islower(id)) { | |
| 640 return 'a'; | |
| 641 } | |
| 642 if (this->get_isalpha(id)) { | |
| 643 return 'x'; | |
| 644 } | |
| 645 if (this->get_isdigit(id)) { | |
| 646 return '0'; | |
| 647 } | |
| 648 if (this->get_ispunctuation(id)) { | |
| 649 return 'p'; | |
| 650 } | |
| 651 return 0; | |
| 652 } | |
| 653 | |
| 654 void UNICHARSET::unichar_insert(const char *const unichar_repr, | |
| 655 OldUncleanUnichars old_style) { | |
| 656 if (old_style == OldUncleanUnichars::kTrue) { | |
| 657 old_style_included_ = true; | |
| 658 } | |
| 659 std::string cleaned = | |
| 660 old_style_included_ ? unichar_repr : CleanupString(unichar_repr); | |
| 661 if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) { | |
| 662 const char *str = cleaned.c_str(); | |
| 663 std::vector<int> encoding; | |
| 664 if (!old_style_included_ && | |
| 665 encode_string(str, true, &encoding, nullptr, nullptr)) { | |
| 666 return; | |
| 667 } | |
| 668 unichars.emplace_back(); | |
| 669 auto &u = unichars.back(); | |
| 670 int index = 0; | |
| 671 do { | |
| 672 if (index >= UNICHAR_LEN) { | |
| 673 fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN, | |
| 674 unichar_repr); | |
| 675 return; | |
| 676 } | |
| 677 u.representation[index++] = *str++; | |
| 678 } while (*str != '\0'); | |
| 679 u.representation[index] = '\0'; | |
| 680 this->set_script(unichars.size() - 1, null_script); | |
| 681 // If the given unichar_repr represents a fragmented character, set | |
| 682 // fragment property to a pointer to CHAR_FRAGMENT class instance with | |
| 683 // information parsed from the unichar representation. Use the script | |
| 684 // of the base unichar for the fragmented character if possible. | |
| 685 CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(u.representation); | |
| 686 u.properties.fragment = frag; | |
| 687 if (frag != nullptr && this->contains_unichar(frag->get_unichar())) { | |
| 688 u.properties.script_id = this->get_script(frag->get_unichar()); | |
| 689 } | |
| 690 u.properties.enabled = true; | |
| 691 ids.insert(u.representation, unichars.size() - 1); | |
| 692 } | |
| 693 } | |
| 694 | |
| 695 bool UNICHARSET::contains_unichar(const char *const unichar_repr) const { | |
| 696 std::string cleaned = | |
| 697 old_style_included_ ? unichar_repr : CleanupString(unichar_repr); | |
| 698 return ids.contains(cleaned.data(), cleaned.size()); | |
| 699 } | |
| 700 | |
| 701 bool UNICHARSET::contains_unichar(const char *const unichar_repr, | |
| 702 int length) const { | |
| 703 if (length == 0) { | |
| 704 return false; | |
| 705 } | |
| 706 std::string cleaned(unichar_repr, length); | |
| 707 if (!old_style_included_) { | |
| 708 cleaned = CleanupString(unichar_repr, length); | |
| 709 } | |
| 710 return ids.contains(cleaned.data(), cleaned.size()); | |
| 711 } | |
| 712 | |
| 713 bool UNICHARSET::eq(UNICHAR_ID unichar_id, | |
| 714 const char *const unichar_repr) const { | |
| 715 return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0; | |
| 716 } | |
| 717 | |
| 718 bool UNICHARSET::save_to_string(std::string &str) const { | |
| 719 const int kFileBufSize = 1024; | |
| 720 char buffer[kFileBufSize + 1]; | |
| 721 snprintf(buffer, kFileBufSize, "%zu\n", this->size()); | |
| 722 str = buffer; | |
| 723 for (unsigned id = 0; id < this->size(); ++id) { | |
| 724 int min_bottom, max_bottom, min_top, max_top; | |
| 725 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top); | |
| 726 float width, width_sd; | |
| 727 get_width_stats(id, &width, &width_sd); | |
| 728 float bearing, bearing_sd; | |
| 729 get_bearing_stats(id, &bearing, &bearing_sd); | |
| 730 float advance, advance_sd; | |
| 731 get_advance_stats(id, &advance, &advance_sd); | |
| 732 unsigned int properties = this->get_properties(id); | |
| 733 if (strcmp(this->id_to_unichar(id), " ") == 0) { | |
| 734 snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties, | |
| 735 this->get_script_from_script_id(this->get_script(id)), | |
| 736 this->get_other_case(id)); | |
| 737 str += buffer; | |
| 738 } else { | |
| 739 std::ostringstream stream; | |
| 740 stream.imbue(std::locale::classic()); | |
| 741 stream << this->id_to_unichar(id) << ' ' << properties << ' ' | |
| 742 << min_bottom << ',' << max_bottom << ',' << min_top << ',' | |
| 743 << max_top << ',' << width << ',' << width_sd << ',' << bearing | |
| 744 << ',' << bearing_sd << ',' << advance << ',' << advance_sd << ' ' | |
| 745 << this->get_script_from_script_id(this->get_script(id)) << ' ' | |
| 746 << this->get_other_case(id) << ' ' << this->get_direction(id) | |
| 747 << ' ' << this->get_mirror(id) << ' ' | |
| 748 << this->get_normed_unichar(id) << "\t# " | |
| 749 << this->debug_str(id).c_str() << '\n'; | |
| 750 str += stream.str().c_str(); | |
| 751 } | |
| 752 } | |
| 753 return true; | |
| 754 } | |
| 755 | |
// Thin adapter that exposes a member fgets() on top of a C FILE stream, so
// that reading from a FILE can be bound as a (char *, int) -> char * callback.
// The stream is only borrowed: this class never opens or closes it.
class LocalFilePointer {
  FILE *stream_;

public:
  LocalFilePointer(FILE *stream) : stream_(stream) {}

  // Forwards to the global ::fgets on the wrapped stream and returns its
  // result unchanged.
  char *fgets(char *dst, int size) {
    char *const line = ::fgets(dst, size, stream_);
    return line;
  }
};
| 766 | |
| 767 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) { | |
| 768 LocalFilePointer lfp(file); | |
| 769 using namespace std::placeholders; // for _1, _2 | |
| 770 std::function<char *(char *, int)> fgets_cb = | |
| 771 std::bind(&LocalFilePointer::fgets, &lfp, _1, _2); | |
| 772 bool success = load_via_fgets(fgets_cb, skip_fragments); | |
| 773 return success; | |
| 774 } | |
| 775 | |
| 776 bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) { | |
| 777 using namespace std::placeholders; // for _1, _2 | |
| 778 std::function<char *(char *, int)> fgets_cb = | |
| 779 std::bind(&tesseract::TFile::FGets, file, _1, _2); | |
| 780 bool success = load_via_fgets(fgets_cb, skip_fragments); | |
| 781 return success; | |
| 782 } | |
| 783 | |
| 784 bool UNICHARSET::load_via_fgets( | |
| 785 const std::function<char *(char *, int)> &fgets_cb, bool skip_fragments) { | |
| 786 int unicharset_size; | |
| 787 char buffer[256]; | |
| 788 | |
| 789 this->clear(); | |
| 790 if (fgets_cb(buffer, sizeof(buffer)) == nullptr || | |
| 791 sscanf(buffer, "%d", &unicharset_size) != 1) { | |
| 792 return false; | |
| 793 } | |
| 794 for (UNICHAR_ID id = 0; id < unicharset_size; ++id) { | |
| 795 char unichar[256]; | |
| 796 unsigned int properties; | |
| 797 char script[64]; | |
| 798 | |
| 799 strncpy(script, null_script, sizeof(script) - 1); | |
| 800 int min_bottom = 0; | |
| 801 int max_bottom = UINT8_MAX; | |
| 802 int min_top = 0; | |
| 803 int max_top = UINT8_MAX; | |
| 804 float width = 0.0f; | |
| 805 float width_sd = 0.0f; | |
| 806 float bearing = 0.0f; | |
| 807 float bearing_sd = 0.0f; | |
| 808 float advance = 0.0f; | |
| 809 float advance_sd = 0.0f; | |
| 810 // TODO(eger): check that this default it ok | |
| 811 // after enabling BiDi iterator for Arabic. | |
| 812 int direction = UNICHARSET::U_LEFT_TO_RIGHT; | |
| 813 UNICHAR_ID other_case = unicharset_size; | |
| 814 UNICHAR_ID mirror = unicharset_size; | |
| 815 if (fgets_cb(buffer, sizeof(buffer)) == nullptr) { | |
| 816 return false; | |
| 817 } | |
| 818 char normed[64]; | |
| 819 normed[0] = '\0'; | |
| 820 std::istringstream stream(buffer); | |
| 821 stream.imbue(std::locale::classic()); | |
| 822 // 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标 # 标 [6807 ]x | |
| 823 // stream.flags(std::ios::hex); | |
| 824 stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec; | |
| 825 // stream.flags(std::ios::dec); | |
| 826 if (stream.fail()) { | |
| 827 fprintf(stderr, "%s:%d failed\n", __FILE__, __LINE__); | |
| 828 return false; | |
| 829 } | |
| 830 auto position = stream.tellg(); | |
| 831 stream.seekg(position); | |
| 832 char c1, c2, c3, c4, c5, c6, c7, c8, c9; | |
| 833 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> | |
| 834 max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >> | |
| 835 bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >> | |
| 836 script >> other_case >> direction >> mirror >> std::setw(63) >> normed; | |
| 837 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' || | |
| 838 c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') { | |
| 839 stream.clear(); | |
| 840 stream.seekg(position); | |
| 841 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> | |
| 842 max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >> | |
| 843 bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >> | |
| 844 script >> other_case >> direction >> mirror; | |
| 845 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' || | |
| 846 c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') { | |
| 847 stream.clear(); | |
| 848 stream.seekg(position); | |
| 849 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> | |
| 850 max_top >> std::setw(63) >> script >> other_case >> direction >> | |
| 851 mirror; | |
| 852 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') { | |
| 853 stream.clear(); | |
| 854 stream.seekg(position); | |
| 855 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> | |
| 856 max_top >> std::setw(63) >> script >> other_case; | |
| 857 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') { | |
| 858 stream.clear(); | |
| 859 stream.seekg(position); | |
| 860 stream >> std::setw(63) >> script >> other_case; | |
| 861 if (stream.fail()) { | |
| 862 stream.clear(); | |
| 863 stream.seekg(position); | |
| 864 stream >> std::setw(63) >> script; | |
| 865 } | |
| 866 } | |
| 867 } | |
| 868 } | |
| 869 } | |
| 870 | |
| 871 // Skip fragments if needed. | |
| 872 CHAR_FRAGMENT *frag = nullptr; | |
| 873 if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) { | |
| 874 int num_pieces = frag->get_total(); | |
| 875 delete frag; | |
| 876 // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in. | |
| 877 if (num_pieces > 1) { | |
| 878 continue; | |
| 879 } | |
| 880 } | |
| 881 // Insert unichar into unicharset and set its properties. | |
| 882 if (strcmp(unichar, "NULL") == 0) { | |
| 883 this->unichar_insert(" "); | |
| 884 } else { | |
| 885 this->unichar_insert_backwards_compatible(unichar); | |
| 886 } | |
| 887 | |
| 888 this->set_isalpha(id, properties & ISALPHA_MASK); | |
| 889 this->set_islower(id, properties & ISLOWER_MASK); | |
| 890 this->set_isupper(id, properties & ISUPPER_MASK); | |
| 891 this->set_isdigit(id, properties & ISDIGIT_MASK); | |
| 892 this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK); | |
| 893 this->set_isngram(id, false); | |
| 894 this->set_script(id, script); | |
| 895 this->unichars[id].properties.enabled = true; | |
| 896 this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top); | |
| 897 this->set_width_stats(id, width, width_sd); | |
| 898 this->set_bearing_stats(id, bearing, bearing_sd); | |
| 899 this->set_advance_stats(id, advance, advance_sd); | |
| 900 this->set_direction(id, static_cast<UNICHARSET::Direction>(direction)); | |
| 901 this->set_other_case(id, (other_case < unicharset_size) ? other_case : id); | |
| 902 this->set_mirror(id, (mirror < unicharset_size) ? mirror : id); | |
| 903 this->set_normed(id, normed[0] != '\0' ? normed : unichar); | |
| 904 } | |
| 905 post_load_setup(); | |
| 906 return true; | |
| 907 } | |
| 908 | |
| 909 // Sets up internal data after loading the file, based on the char | |
| 910 // properties. Called from load_from_file, but also needs to be run | |
| 911 // during set_unicharset_properties. | |
| 912 void UNICHARSET::post_load_setup() { | |
| 913 // Number of alpha chars with the case property minus those without, | |
| 914 // in order to determine that half the alpha chars have case. | |
| 915 int net_case_alphas = 0; | |
| 916 int x_height_alphas = 0; | |
| 917 int cap_height_alphas = 0; | |
| 918 top_bottom_set_ = false; | |
| 919 for (unsigned id = 0; id < unichars.size(); ++id) { | |
| 920 int min_bottom = 0; | |
| 921 int max_bottom = UINT8_MAX; | |
| 922 int min_top = 0; | |
| 923 int max_top = UINT8_MAX; | |
| 924 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top); | |
| 925 if (min_top > 0) { | |
| 926 top_bottom_set_ = true; | |
| 927 } | |
| 928 if (get_isalpha(id)) { | |
| 929 if (get_islower(id) || get_isupper(id)) { | |
| 930 ++net_case_alphas; | |
| 931 } else { | |
| 932 --net_case_alphas; | |
| 933 } | |
| 934 if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) { | |
| 935 ++x_height_alphas; | |
| 936 } else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) { | |
| 937 ++cap_height_alphas; | |
| 938 } | |
| 939 } | |
| 940 set_normed_ids(id); | |
| 941 } | |
| 942 | |
| 943 script_has_upper_lower_ = net_case_alphas > 0; | |
| 944 script_has_xheight_ = | |
| 945 script_has_upper_lower_ || | |
| 946 (x_height_alphas > cap_height_alphas * kMinXHeightFraction && | |
| 947 cap_height_alphas > x_height_alphas * kMinCapHeightFraction); | |
| 948 | |
| 949 null_sid_ = get_script_id_from_name(null_script); | |
| 950 ASSERT_HOST(null_sid_ == 0); | |
| 951 common_sid_ = get_script_id_from_name("Common"); | |
| 952 latin_sid_ = get_script_id_from_name("Latin"); | |
| 953 cyrillic_sid_ = get_script_id_from_name("Cyrillic"); | |
| 954 greek_sid_ = get_script_id_from_name("Greek"); | |
| 955 han_sid_ = get_script_id_from_name("Han"); | |
| 956 hiragana_sid_ = get_script_id_from_name("Hiragana"); | |
| 957 katakana_sid_ = get_script_id_from_name("Katakana"); | |
| 958 thai_sid_ = get_script_id_from_name("Thai"); | |
| 959 hangul_sid_ = get_script_id_from_name("Hangul"); | |
| 960 | |
| 961 // Compute default script. Use the highest-counting alpha script, that is | |
| 962 // not the common script, as that still contains some "alphas". | |
| 963 int *script_counts = new int[script_table_size_used]; | |
| 964 memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used); | |
| 965 for (unsigned id = 0; id < unichars.size(); ++id) { | |
| 966 if (get_isalpha(id)) { | |
| 967 ++script_counts[get_script(id)]; | |
| 968 } | |
| 969 } | |
| 970 default_sid_ = 0; | |
| 971 for (int s = 1; s < script_table_size_used; ++s) { | |
| 972 if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) { | |
| 973 default_sid_ = s; | |
| 974 } | |
| 975 } | |
| 976 delete[] script_counts; | |
| 977 } | |
| 978 | |
| 979 // Returns true if right_to_left scripts are significant in the unicharset, | |
| 980 // but without being so sensitive that "universal" unicharsets containing | |
| 981 // characters from many scripts, like orientation and script detection, | |
| 982 // look like they are right_to_left. | |
| 983 bool UNICHARSET::major_right_to_left() const { | |
| 984 int ltr_count = 0; | |
| 985 int rtl_count = 0; | |
| 986 for (unsigned id = 0; id < unichars.size(); ++id) { | |
| 987 int dir = get_direction(id); | |
| 988 if (dir == UNICHARSET::U_LEFT_TO_RIGHT) { | |
| 989 ltr_count++; | |
| 990 } | |
| 991 if (dir == UNICHARSET::U_RIGHT_TO_LEFT || | |
| 992 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC || | |
| 993 dir == UNICHARSET::U_ARABIC_NUMBER) { | |
| 994 rtl_count++; | |
| 995 } | |
| 996 } | |
| 997 return rtl_count > ltr_count; | |
| 998 } | |
| 999 | |
| 1000 // Set a whitelist and/or blacklist of characters to recognize. | |
| 1001 // An empty or nullptr whitelist enables everything (minus any blacklist). | |
| 1002 // An empty or nullptr blacklist disables nothing. | |
| 1003 // An empty or nullptr unblacklist has no effect. | |
| 1004 void UNICHARSET::set_black_and_whitelist(const char *blacklist, | |
| 1005 const char *whitelist, | |
| 1006 const char *unblacklist) { | |
| 1007 bool def_enabled = whitelist == nullptr || whitelist[0] == '\0'; | |
| 1008 // Set everything to default | |
| 1009 for (auto &uc : unichars) { | |
| 1010 uc.properties.enabled = def_enabled; | |
| 1011 } | |
| 1012 if (!def_enabled) { | |
| 1013 // Enable the whitelist. | |
| 1014 std::vector<UNICHAR_ID> encoding; | |
| 1015 encode_string(whitelist, false, &encoding, nullptr, nullptr); | |
| 1016 for (auto it : encoding) { | |
| 1017 if (it != INVALID_UNICHAR_ID) { | |
| 1018 unichars[it].properties.enabled = true; | |
| 1019 } | |
| 1020 } | |
| 1021 } | |
| 1022 if (blacklist != nullptr && blacklist[0] != '\0') { | |
| 1023 // Disable the blacklist. | |
| 1024 std::vector<UNICHAR_ID> encoding; | |
| 1025 encode_string(blacklist, false, &encoding, nullptr, nullptr); | |
| 1026 for (auto it : encoding) { | |
| 1027 if (it != INVALID_UNICHAR_ID) { | |
| 1028 unichars[it].properties.enabled = false; | |
| 1029 } | |
| 1030 } | |
| 1031 } | |
| 1032 if (unblacklist != nullptr && unblacklist[0] != '\0') { | |
| 1033 // Re-enable the unblacklist. | |
| 1034 std::vector<UNICHAR_ID> encoding; | |
| 1035 encode_string(unblacklist, false, &encoding, nullptr, nullptr); | |
| 1036 for (auto it : encoding) { | |
| 1037 if (it != INVALID_UNICHAR_ID) { | |
| 1038 unichars[it].properties.enabled = true; | |
| 1039 } | |
| 1040 } | |
| 1041 } | |
| 1042 } | |
| 1043 | |
| 1044 // Returns true if there are any repeated unicodes in the normalized | |
| 1045 // text of any unichar-id in the unicharset. | |
| 1046 bool UNICHARSET::AnyRepeatedUnicodes() const { | |
| 1047 int start_id = 0; | |
| 1048 if (has_special_codes()) { | |
| 1049 start_id = SPECIAL_UNICHAR_CODES_COUNT; | |
| 1050 } | |
| 1051 for (unsigned id = start_id; id < unichars.size(); ++id) { | |
| 1052 // Convert to unicodes. | |
| 1053 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id)); | |
| 1054 for (size_t u = 1; u < unicodes.size(); ++u) { | |
| 1055 if (unicodes[u - 1] == unicodes[u]) { | |
| 1056 return true; | |
| 1057 } | |
| 1058 } | |
| 1059 } | |
| 1060 return false; | |
| 1061 } | |
| 1062 | |
| 1063 int UNICHARSET::add_script(const char *script) { | |
| 1064 for (int i = 0; i < script_table_size_used; ++i) { | |
| 1065 if (strcmp(script, script_table[i]) == 0) { | |
| 1066 return i; | |
| 1067 } | |
| 1068 } | |
| 1069 if (script_table_size_reserved == 0) { | |
| 1070 script_table_size_reserved = 8; | |
| 1071 script_table = new char *[script_table_size_reserved]; | |
| 1072 } else if (script_table_size_used >= script_table_size_reserved) { | |
| 1073 assert(script_table_size_used == script_table_size_reserved); | |
| 1074 script_table_size_reserved += script_table_size_reserved; | |
| 1075 char **new_script_table = new char *[script_table_size_reserved]; | |
| 1076 memcpy(new_script_table, script_table, | |
| 1077 script_table_size_used * sizeof(char *)); | |
| 1078 delete[] script_table; | |
| 1079 script_table = new_script_table; | |
| 1080 } | |
| 1081 script_table[script_table_size_used] = new char[strlen(script) + 1]; | |
| 1082 strcpy(script_table[script_table_size_used], script); | |
| 1083 return script_table_size_used++; | |
| 1084 } | |
| 1085 | |
| 1086 // Returns the string that represents a fragment | |
| 1087 // with the given unichar, pos and total. | |
| 1088 std::string CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total, | |
| 1089 bool natural) { | |
| 1090 if (total == 1) { | |
| 1091 return std::string(unichar); | |
| 1092 } | |
| 1093 std::string result; | |
| 1094 result += kSeparator; | |
| 1095 result += unichar; | |
| 1096 char buffer[kMaxLen]; | |
| 1097 snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, | |
| 1098 natural ? kNaturalFlag : kSeparator, total); | |
| 1099 result += buffer; | |
| 1100 return result; | |
| 1101 } | |
| 1102 | |
| 1103 CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) { | |
| 1104 const char *ptr = string; | |
| 1105 int len = strlen(string); | |
| 1106 if (len < kMinLen || *ptr != kSeparator) { | |
| 1107 return nullptr; // this string cannot represent a fragment | |
| 1108 } | |
| 1109 ptr++; // move to the next character | |
| 1110 int step = 0; | |
| 1111 while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) { | |
| 1112 step += UNICHAR::utf8_step(ptr + step); | |
| 1113 } | |
| 1114 if (step == 0 || step > UNICHAR_LEN) { | |
| 1115 return nullptr; // no character for unichar or the character is too long | |
| 1116 } | |
| 1117 char unichar[UNICHAR_LEN + 1]; | |
| 1118 strncpy(unichar, ptr, step); | |
| 1119 unichar[step] = '\0'; // null terminate unichar | |
| 1120 ptr += step; // move to the next fragment separator | |
| 1121 int pos = 0; | |
| 1122 int total = 0; | |
| 1123 bool natural = false; | |
| 1124 char *end_ptr = nullptr; | |
| 1125 for (int i = 0; i < 2; i++) { | |
| 1126 if (ptr > string + len || *ptr != kSeparator) { | |
| 1127 if (i == 1 && *ptr == kNaturalFlag) { | |
| 1128 natural = true; | |
| 1129 } else { | |
| 1130 return nullptr; // Failed to parse fragment representation. | |
| 1131 } | |
| 1132 } | |
| 1133 ptr++; // move to the next character | |
| 1134 i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10)) | |
| 1135 : total = static_cast<int>(strtol(ptr, &end_ptr, 10)); | |
| 1136 ptr = end_ptr; | |
| 1137 } | |
| 1138 if (ptr != string + len) { | |
| 1139 return nullptr; // malformed fragment representation | |
| 1140 } | |
| 1141 auto *fragment = new CHAR_FRAGMENT(); | |
| 1142 fragment->set_all(unichar, pos, total, natural); | |
| 1143 return fragment; | |
| 1144 } | |
| 1145 | |
| 1146 int UNICHARSET::get_script_id_from_name(const char *script_name) const { | |
| 1147 for (int i = 0; i < script_table_size_used; ++i) { | |
| 1148 if (strcmp(script_name, script_table[i]) == 0) { | |
| 1149 return i; | |
| 1150 } | |
| 1151 } | |
| 1152 return 0; // 0 is always the null_script | |
| 1153 } | |
| 1154 | |
| 1155 // Removes/replaces content that belongs in rendered text, but not in the | |
| 1156 // unicharset. | |
| 1157 /* static */ | |
| 1158 std::string UNICHARSET::CleanupString(const char *utf8_str, size_t length) { | |
| 1159 std::string result; | |
| 1160 result.reserve(length); | |
| 1161 char ch; | |
| 1162 while ((ch = *utf8_str) != '\0' && length-- > 0) { | |
| 1163 int key_index = 0; | |
| 1164 const char *key; | |
| 1165 while ((key = kCleanupMaps[key_index][0]) != nullptr) { | |
| 1166 int match = 0; | |
| 1167 while (key[match] != '\0' && key[match] == utf8_str[match]) { | |
| 1168 ++match; | |
| 1169 } | |
| 1170 if (key[match] == '\0') { | |
| 1171 utf8_str += match; | |
| 1172 break; | |
| 1173 } | |
| 1174 ++key_index; | |
| 1175 } | |
| 1176 if (key == nullptr) { | |
| 1177 result.push_back(ch); | |
| 1178 ++utf8_str; | |
| 1179 } else { | |
| 1180 result.append(kCleanupMaps[key_index][1]); | |
| 1181 } | |
| 1182 } | |
| 1183 return result; | |
| 1184 } | |
| 1185 | |
| 1186 } // namespace tesseract |
