Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccutil/unicharset.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: unicharset.h | |
| 3 // Description: Unicode character/ligature set class. | |
| 4 // Author: Thomas Kielbus | |
| 5 // | |
| 6 // (C) Copyright 2006, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_CCUTIL_UNICHARSET_H_ | |
| 20 #define TESSERACT_CCUTIL_UNICHARSET_H_ | |
| 21 | |
| 22 #include "errcode.h" | |
| 23 #include "unicharmap.h" | |
| 24 | |
| 25 #include <tesseract/unichar.h> | |
| 26 #include "helpers.h" | |
| 27 #include "serialis.h" | |
| 28 | |
| 29 #include <functional> // for std::function | |
| 30 | |
| 31 namespace tesseract { | |
| 32 | |
| 33 // Enum holding special values of unichar_id. Every unicharset has these. | |
| 34 // Warning! Keep in sync with kSpecialUnicharCodes. | |
| 35 enum SpecialUnicharCodes { | |
| 36 UNICHAR_SPACE, | |
| 37 UNICHAR_JOINED, | |
| 38 UNICHAR_BROKEN, | |
| 39 | |
| 40 SPECIAL_UNICHAR_CODES_COUNT | |
| 41 }; | |
| 42 | |
| 43 // Boolean flag for unichar_insert. It's a bit of a double negative to allow | |
| 44 // the default value to be false. | |
| 45 enum class OldUncleanUnichars { | |
| 46 kFalse, | |
| 47 kTrue, | |
| 48 }; | |
| 49 | |
| 50 class TESS_API CHAR_FRAGMENT { | |
| 51 public: | |
| 52 // Minimum number of characters used for fragment representation. | |
| 53 static const int kMinLen = 6; | |
| 54 // Maximum number of characters used for fragment representation. | |
| 55 static const int kMaxLen = 3 + UNICHAR_LEN + 2; | |
| 56 // Maximum number of fragments per character. | |
| 57 static const int kMaxChunks = 5; | |
| 58 | |
| 59 // Setters and Getters. | |
| 60 inline void set_all(const char *unichar, int pos, int total, bool natural) { | |
| 61 set_unichar(unichar); | |
| 62 set_pos(pos); | |
| 63 set_total(total); | |
| 64 set_natural(natural); | |
| 65 } | |
| 66 inline void set_unichar(const char *uch) { | |
| 67 strncpy(this->unichar, uch, sizeof(this->unichar)); | |
| 68 this->unichar[UNICHAR_LEN] = '\0'; | |
| 69 } | |
| 70 inline void set_pos(int p) { | |
| 71 this->pos = p; | |
| 72 } | |
| 73 inline void set_total(int t) { | |
| 74 this->total = t; | |
| 75 } | |
| 76 inline const char *get_unichar() const { | |
| 77 return this->unichar; | |
| 78 } | |
| 79 inline int get_pos() const { | |
| 80 return this->pos; | |
| 81 } | |
| 82 inline int get_total() const { | |
| 83 return this->total; | |
| 84 } | |
| 85 | |
| 86 // Returns the string that represents a fragment | |
| 87 // with the given unichar, pos and total. | |
| 88 static std::string to_string(const char *unichar, int pos, int total, | |
| 89 bool natural); | |
| 90 // Returns the string that represents this fragment. | |
| 91 std::string to_string() const { | |
| 92 return to_string(unichar, pos, total, natural); | |
| 93 } | |
| 94 | |
| 95 // Checks whether a fragment has the same unichar, | |
| 96 // position and total as the given inputs. | |
| 97 inline bool equals(const char *other_unichar, int other_pos, | |
| 98 int other_total) const { | |
| 99 return (strcmp(this->unichar, other_unichar) == 0 && | |
| 100 this->pos == other_pos && this->total == other_total); | |
| 101 } | |
| 102 inline bool equals(const CHAR_FRAGMENT *other) const { | |
| 103 return this->equals(other->get_unichar(), other->get_pos(), | |
| 104 other->get_total()); | |
| 105 } | |
| 106 | |
| 107 // Checks whether a given fragment is a continuation of this fragment. | |
| 108 // Assumes that the given fragment pointer is not nullptr. | |
| 109 inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const { | |
| 110 return (strcmp(this->unichar, fragment->get_unichar()) == 0 && | |
| 111 this->total == fragment->get_total() && | |
| 112 this->pos == fragment->get_pos() + 1); | |
| 113 } | |
| 114 | |
| 115 // Returns true if this fragment is a beginning fragment. | |
| 116 inline bool is_beginning() const { | |
| 117 return this->pos == 0; | |
| 118 } | |
| 119 | |
| 120 // Returns true if this fragment is an ending fragment. | |
| 121 inline bool is_ending() const { | |
| 122 return this->pos == this->total - 1; | |
| 123 } | |
| 124 | |
| 125 // Returns true if the fragment was a separate component to begin with, | |
| 126 // ie did not need chopping to be isolated, but may have been separated | |
| 127 // out from a multi-outline blob. | |
| 128 inline bool is_natural() const { | |
| 129 return natural; | |
| 130 } | |
| 131 void set_natural(bool value) { | |
| 132 natural = value; | |
| 133 } | |
| 134 | |
| 135 // Parses the string to see whether it represents a character fragment | |
| 136 // (rather than a regular character). If so, allocates memory for a new | |
| 137 // CHAR_FRAGMENT instance and fills it in with the corresponding fragment | |
| 138 // information. Fragments are of the form: | |
| 139 // |m|1|2, meaning chunk 1 of 2 of character m, or | |
| 140 // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed | |
| 141 // to divide the parts, as they were already separate connected components. | |
| 142 // | |
| 143 // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT | |
| 144 // instance, otherwise (if the string does not represent a fragment or it | |
| 145 // looks like it does, but parsing it as a fragment fails) returns nullptr. | |
| 146 // | |
| 147 // Note: The caller is responsible for deallocating memory | |
| 148 // associated with the returned pointer. | |
| 149 static CHAR_FRAGMENT *parse_from_string(const char *str); | |
| 150 | |
| 151 private: | |
| 152 char unichar[UNICHAR_LEN + 1]; | |
| 153 // True if the fragment was a separate component to begin with, | |
| 154 // ie did not need chopping to be isolated, but may have been separated | |
| 155 // out from a multi-outline blob. | |
| 156 bool natural; | |
| 157 int16_t pos; // fragment position in the character | |
| 158 int16_t total; // total number of fragments in the character | |
| 159 }; | |
| 160 | |
| 161 // The UNICHARSET class is a utility class for Tesseract that holds the | |
| 162 // set of characters that are used by the engine. Each character is identified | |
| 163 // by a unique number, from 0 to (size - 1). | |
| 164 class TESS_API UNICHARSET { | |
| 165 public: | |
| 166 // Custom list of characters and their ligature forms (UTF8) | |
| 167 // These map to unicode values in the private use area (PUC) and are supported | |
| 168 // by only a few font families (eg. Wyld, Adobe Caslon Pro). | |
| 169 static const char *kCustomLigatures[][2]; | |
| 170 | |
| 171 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum. | |
| 172 static const char *kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]; | |
| 173 | |
| 174 // ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h) | |
| 175 enum Direction { | |
| 176 U_LEFT_TO_RIGHT = 0, | |
| 177 U_RIGHT_TO_LEFT = 1, | |
| 178 U_EUROPEAN_NUMBER = 2, | |
| 179 U_EUROPEAN_NUMBER_SEPARATOR = 3, | |
| 180 U_EUROPEAN_NUMBER_TERMINATOR = 4, | |
| 181 U_ARABIC_NUMBER = 5, | |
| 182 U_COMMON_NUMBER_SEPARATOR = 6, | |
| 183 U_BLOCK_SEPARATOR = 7, | |
| 184 U_SEGMENT_SEPARATOR = 8, | |
| 185 U_WHITE_SPACE_NEUTRAL = 9, | |
| 186 U_OTHER_NEUTRAL = 10, | |
| 187 U_LEFT_TO_RIGHT_EMBEDDING = 11, | |
| 188 U_LEFT_TO_RIGHT_OVERRIDE = 12, | |
| 189 U_RIGHT_TO_LEFT_ARABIC = 13, | |
| 190 U_RIGHT_TO_LEFT_EMBEDDING = 14, | |
| 191 U_RIGHT_TO_LEFT_OVERRIDE = 15, | |
| 192 U_POP_DIRECTIONAL_FORMAT = 16, | |
| 193 U_DIR_NON_SPACING_MARK = 17, | |
| 194 U_BOUNDARY_NEUTRAL = 18, | |
| 195 U_FIRST_STRONG_ISOLATE = 19, | |
| 196 U_LEFT_TO_RIGHT_ISOLATE = 20, | |
| 197 U_RIGHT_TO_LEFT_ISOLATE = 21, | |
| 198 U_POP_DIRECTIONAL_ISOLATE = 22, | |
| 199 #ifndef U_HIDE_DEPRECATED_API | |
| 200 U_CHAR_DIRECTION_COUNT | |
| 201 #endif // U_HIDE_DEPRECATED_API | |
| 202 }; | |
| 203 | |
| 204 // Create an empty UNICHARSET | |
| 205 UNICHARSET(); | |
| 206 | |
| 207 ~UNICHARSET(); | |
| 208 | |
| 209 // Return the UNICHAR_ID of a given unichar representation within the | |
| 210 // UNICHARSET. | |
| 211 UNICHAR_ID unichar_to_id(const char *const unichar_repr) const; | |
| 212 | |
| 213 // Return the UNICHAR_ID of a given unichar representation within the | |
| 214 // UNICHARSET. Only the first length characters from unichar_repr are used. | |
| 215 UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const; | |
| 216 | |
| 217 // Return the minimum number of bytes that matches a legal UNICHAR_ID, | |
| 218 // while leaving the rest of the string encodable. Returns 0 if the | |
| 219 // beginning of the string is not encodable. | |
| 220 // WARNING: this function now encodes the whole string for precision. | |
| 221 // Use encode_string in preference to repeatedly calling step. | |
| 222 int step(const char *str) const; | |
| 223 | |
| 224 // Returns true if the given UTF-8 string is encodable with this UNICHARSET. | |
| 225 // If not encodable, the first byte offset which cannot be converted is | |
| 226 // written to the second (output) argument, first_bad_position. | |
| 227 bool encodable_string(const char *str, unsigned *first_bad_position) const; | |
| 228 | |
| 229 // Encodes the given UTF-8 string with this UNICHARSET. | |
| 230 // If any part of the string cannot be encoded (because the utf8 can't | |
| 231 // be broken up into pieces that are in the unicharset) then: | |
| 232 // if give_up_on_failure, stops and returns a partial encoding, | |
| 233 // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding. | |
| 234 // Returns true if the encoding succeeds completely, false if there is at | |
| 235 // least one failure. | |
| 236 // If lengths is not nullptr, then it is filled with the corresponding | |
| 237 // byte length of each encoded UNICHAR_ID. | |
| 238 // If encoded_length is not nullptr then on return it contains the length of | |
| 239 // str that was encoded. (if give_up_on_failure the location of the first | |
| 240 // failure, otherwise strlen(str).) | |
| 241 // WARNING: Caller must guarantee that str has already been cleaned of codes | |
| 242 // that do not belong in the unicharset, or encoding may fail. | |
| 243 // Use CleanupString to perform the cleaning. | |
| 244 bool encode_string(const char *str, bool give_up_on_failure, | |
| 245 std::vector<UNICHAR_ID> *encoding, | |
| 246 std::vector<char> *lengths, | |
| 247 unsigned *encoded_length) const; | |
| 248 | |
| 249 // Return the unichar representation corresponding to the given UNICHAR_ID | |
| 250 // within the UNICHARSET. | |
| 251 const char *id_to_unichar(UNICHAR_ID id) const; | |
| 252 | |
| 253 // Return the UTF8 representation corresponding to the given UNICHAR_ID after | |
| 254 // resolving any private encodings internal to Tesseract. This method is | |
| 255 // preferable to id_to_unichar for outputting text that will be visible to | |
| 256 // external applications. | |
| 257 const char *id_to_unichar_ext(UNICHAR_ID id) const; | |
| 258 | |
| 259 // Return a string that reformats the utf8 str into the str followed | |
| 260 // by its hex unicodes. | |
| 261 static std::string debug_utf8_str(const char *str); | |
| 262 | |
| 263 // Removes/replaces content that belongs in rendered text, but not in the | |
| 264 // unicharset. | |
| 265 static std::string CleanupString(const char *utf8_str) { | |
| 266 return CleanupString(utf8_str, strlen(utf8_str)); | |
| 267 } | |
| 268 static std::string CleanupString(const char *utf8_str, size_t length); | |
| 269 | |
| 270 // Return a string containing debug information on the unichar, including | |
| 271 // the id_to_unichar, its hex unicodes and the properties. | |
| 272 std::string debug_str(UNICHAR_ID id) const; | |
| 273 std::string debug_str(const char *unichar_repr) const { | |
| 274 return debug_str(unichar_to_id(unichar_repr)); | |
| 275 } | |
| 276 | |
| 277 // Adds a unichar representation to the set. If old_style is true, then | |
| 278 // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL | |
| 279 // characters are ignored/skipped as if they don't exist and n-grams that | |
| 280 // can already be encoded are not added. | |
| 281 void unichar_insert(const char *const unichar_repr, | |
| 282 OldUncleanUnichars old_style); | |
| 283 void unichar_insert(const char *const unichar_repr) { | |
| 284 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse); | |
| 285 } | |
| 286 // Adds a unichar representation to the set. Avoids setting old_style to true, | |
| 287 // unless it is necessary to make the new unichar get added. | |
| 288 void unichar_insert_backwards_compatible(const char *const unichar_repr) { | |
| 289 std::string cleaned = CleanupString(unichar_repr); | |
| 290 if (cleaned != unichar_repr) { | |
| 291 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue); | |
| 292 } else { | |
| 293 auto old_size = size(); | |
| 294 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse); | |
| 295 if (size() == old_size) { | |
| 296 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue); | |
| 297 } | |
| 298 } | |
| 299 } | |
| 300 | |
| 301 // Return true if the given unichar id exists within the set. | |
| 302 // Relies on the fact that unichar ids are contiguous in the unicharset. | |
| 303 bool contains_unichar_id(UNICHAR_ID unichar_id) const { | |
| 304 return static_cast<size_t>(unichar_id) < unichars.size(); | |
| 305 } | |
| 306 | |
| 307 // Return true if the given unichar representation exists within the set. | |
| 308 bool contains_unichar(const char *const unichar_repr) const; | |
| 309 bool contains_unichar(const char *const unichar_repr, int length) const; | |
| 310 | |
| 311 // Return true if the given unichar representation corresponds to the given | |
| 312 // UNICHAR_ID within the set. | |
| 313 bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const; | |
| 314 | |
| 315 // Delete CHAR_FRAGMENTs stored in properties of unichars array. | |
| 316 void delete_pointers_in_unichars() { | |
| 317 for (auto &unichar : unichars) { | |
| 318 delete unichar.properties.fragment; | |
| 319 unichar.properties.fragment = nullptr; | |
| 320 } | |
| 321 } | |
| 322 | |
| 323 // Clear the UNICHARSET (all the previous data is lost). | |
| 324 void clear() { | |
| 325 if (script_table != nullptr) { | |
| 326 for (int i = 0; i < script_table_size_used; ++i) { | |
| 327 delete[] script_table[i]; | |
| 328 } | |
| 329 delete[] script_table; | |
| 330 script_table = nullptr; | |
| 331 script_table_size_used = 0; | |
| 332 } | |
| 333 script_table_size_reserved = 0; | |
| 334 delete_pointers_in_unichars(); | |
| 335 unichars.clear(); | |
| 336 ids.clear(); | |
| 337 top_bottom_set_ = false; | |
| 338 script_has_upper_lower_ = false; | |
| 339 script_has_xheight_ = false; | |
| 340 old_style_included_ = false; | |
| 341 null_sid_ = 0; | |
| 342 common_sid_ = 0; | |
| 343 latin_sid_ = 0; | |
| 344 cyrillic_sid_ = 0; | |
| 345 greek_sid_ = 0; | |
| 346 han_sid_ = 0; | |
| 347 hiragana_sid_ = 0; | |
| 348 katakana_sid_ = 0; | |
| 349 thai_sid_ = 0; | |
| 350 hangul_sid_ = 0; | |
| 351 default_sid_ = 0; | |
| 352 } | |
| 353 | |
| 354 // Return the size of the set (the number of different UNICHAR it holds). | |
| 355 size_t size() const { | |
| 356 return unichars.size(); | |
| 357 } | |
| 358 | |
| 359 // Opens the file indicated by filename and saves unicharset to that file. | |
| 360 // Returns true if the operation is successful. | |
| 361 bool save_to_file(const char *const filename) const { | |
| 362 FILE *file = fopen(filename, "w+b"); | |
| 363 if (file == nullptr) { | |
| 364 return false; | |
| 365 } | |
| 366 bool result = save_to_file(file); | |
| 367 fclose(file); | |
| 368 return result; | |
| 369 } | |
| 370 | |
| 371 // Saves the content of the UNICHARSET to the given file. | |
| 372 // Returns true if the operation is successful. | |
| 373 bool save_to_file(FILE *file) const { | |
| 374 std::string str; | |
| 375 return save_to_string(str) && | |
| 376 tesseract::Serialize(file, &str[0], str.length()); | |
| 377 } | |
| 378 | |
| 379 bool save_to_file(tesseract::TFile *file) const { | |
| 380 std::string str; | |
| 381 return save_to_string(str) && file->Serialize(&str[0], str.length()); | |
| 382 } | |
| 383 | |
| 384 // Saves the content of the UNICHARSET to the given string. | |
| 385 // Returns true if the operation is successful. | |
| 386 bool save_to_string(std::string &str) const; | |
| 387 | |
| 388 // Opens the file indicated by filename and loads the UNICHARSET | |
| 389 // from the given file. The previous data is lost. | |
| 390 // Returns true if the operation is successful. | |
| 391 bool load_from_file(const char *const filename, bool skip_fragments) { | |
| 392 FILE *file = fopen(filename, "rb"); | |
| 393 if (file == nullptr) { | |
| 394 return false; | |
| 395 } | |
| 396 bool result = load_from_file(file, skip_fragments); | |
| 397 fclose(file); | |
| 398 return result; | |
| 399 } | |
| 400 // As above with skip_fragments = false. Returns true if successful. | |
| 401 bool load_from_file(const char *const filename) { | |
| 402 return load_from_file(filename, false); | |
| 403 } | |
| 404 | |
| 405 // Loads the UNICHARSET from the given file. The previous data is lost. | |
| 406 // Returns true if the operation is successful. | |
| 407 bool load_from_file(FILE *file, bool skip_fragments); | |
| 408 bool load_from_file(FILE *file) { | |
| 409 return load_from_file(file, false); | |
| 410 } | |
| 411 bool load_from_file(tesseract::TFile *file, bool skip_fragments); | |
| 412 | |
| 413 // Sets up internal data after loading the file, based on the char | |
| 414 // properties. Called from load_from_file, but also needs to be run | |
| 415 // during set_unicharset_properties. | |
| 416 void post_load_setup(); | |
| 417 | |
| 418 // Returns true if right_to_left scripts are significant in the unicharset, | |
| 419 // but without being so sensitive that "universal" unicharsets containing | |
| 420 // characters from many scripts, like orientation and script detection, | |
| 421 // look like they are right_to_left. | |
| 422 bool major_right_to_left() const; | |
| 423 | |
| 424 // Set a whitelist and/or blacklist of characters to recognize. | |
| 425 // An empty or nullptr whitelist enables everything (minus any blacklist). | |
| 426 // An empty or nullptr blacklist disables nothing. | |
| 427 // An empty or nullptr unblacklist has no effect. | |
| 428 // The blacklist overrides the whitelist. | |
| 429 // The unblacklist overrides the blacklist. | |
| 430 // Each list is a string of utf8 character strings. Boundaries between | |
| 431 // unicharset units are worked out automatically, and characters not in | |
| 432 // the unicharset are silently ignored. | |
| 433 void set_black_and_whitelist(const char *blacklist, const char *whitelist, | |
| 434 const char *unblacklist); | |
| 435 | |
| 436 // Set the isalpha property of the given unichar to the given value. | |
| 437 void set_isalpha(UNICHAR_ID unichar_id, bool value) { | |
| 438 unichars[unichar_id].properties.isalpha = value; | |
| 439 } | |
| 440 | |
| 441 // Set the islower property of the given unichar to the given value. | |
| 442 void set_islower(UNICHAR_ID unichar_id, bool value) { | |
| 443 unichars[unichar_id].properties.islower = value; | |
| 444 } | |
| 445 | |
| 446 // Set the isupper property of the given unichar to the given value. | |
| 447 void set_isupper(UNICHAR_ID unichar_id, bool value) { | |
| 448 unichars[unichar_id].properties.isupper = value; | |
| 449 } | |
| 450 | |
| 451 // Set the isdigit property of the given unichar to the given value. | |
| 452 void set_isdigit(UNICHAR_ID unichar_id, bool value) { | |
| 453 unichars[unichar_id].properties.isdigit = value; | |
| 454 } | |
| 455 | |
| 456 // Set the ispunctuation property of the given unichar to the given value. | |
| 457 void set_ispunctuation(UNICHAR_ID unichar_id, bool value) { | |
| 458 unichars[unichar_id].properties.ispunctuation = value; | |
| 459 } | |
| 460 | |
| 461 // Set the isngram property of the given unichar to the given value. | |
| 462 void set_isngram(UNICHAR_ID unichar_id, bool value) { | |
| 463 unichars[unichar_id].properties.isngram = value; | |
| 464 } | |
| 465 | |
| 466 // Set the script name of the given unichar to the given value. | |
| 467 // Value is copied and thus can be a temporary. | |
| 468 void set_script(UNICHAR_ID unichar_id, const char *value) { | |
| 469 unichars[unichar_id].properties.script_id = add_script(value); | |
| 470 } | |
| 471 | |
| 472 // Set other_case unichar id in the properties for the given unichar id. | |
| 473 void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) { | |
| 474 unichars[unichar_id].properties.other_case = other_case; | |
| 475 } | |
| 476 | |
| 477 // Set the direction property of the given unichar to the given value. | |
| 478 void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) { | |
| 479 unichars[unichar_id].properties.direction = value; | |
| 480 } | |
| 481 | |
| 482 // Set mirror unichar id in the properties for the given unichar id. | |
| 483 void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) { | |
| 484 unichars[unichar_id].properties.mirror = mirror; | |
| 485 } | |
| 486 | |
| 487 // Record normalized version of unichar with the given unichar_id. | |
| 488 void set_normed(UNICHAR_ID unichar_id, const char *normed) { | |
| 489 unichars[unichar_id].properties.normed = normed; | |
| 490 unichars[unichar_id].properties.normed_ids.clear(); | |
| 491 } | |
| 492 // Sets the normed_ids vector from the normed string. normed_ids is not | |
| 493 // stored in the file, and needs to be set when the UNICHARSET is loaded. | |
| 494 void set_normed_ids(UNICHAR_ID unichar_id); | |
| 495 | |
| 496 // Return the isalpha property of the given unichar. | |
| 497 bool get_isalpha(UNICHAR_ID unichar_id) const { | |
| 498 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 499 return false; | |
| 500 } | |
| 501 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 502 return unichars[unichar_id].properties.isalpha; | |
| 503 } | |
| 504 | |
| 505 // Return the islower property of the given unichar. | |
| 506 bool get_islower(UNICHAR_ID unichar_id) const { | |
| 507 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 508 return false; | |
| 509 } | |
| 510 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 511 return unichars[unichar_id].properties.islower; | |
| 512 } | |
| 513 | |
| 514 // Return the isupper property of the given unichar. | |
| 515 bool get_isupper(UNICHAR_ID unichar_id) const { | |
| 516 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 517 return false; | |
| 518 } | |
| 519 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 520 return unichars[unichar_id].properties.isupper; | |
| 521 } | |
| 522 | |
| 523 // Return the isdigit property of the given unichar. | |
| 524 bool get_isdigit(UNICHAR_ID unichar_id) const { | |
| 525 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 526 return false; | |
| 527 } | |
| 528 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 529 return unichars[unichar_id].properties.isdigit; | |
| 530 } | |
| 531 | |
| 532 // Return the ispunctuation property of the given unichar. | |
| 533 bool get_ispunctuation(UNICHAR_ID unichar_id) const { | |
| 534 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 535 return false; | |
| 536 } | |
| 537 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 538 return unichars[unichar_id].properties.ispunctuation; | |
| 539 } | |
| 540 | |
| 541 // Return the isngram property of the given unichar. | |
| 542 bool get_isngram(UNICHAR_ID unichar_id) const { | |
| 543 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 544 return false; | |
| 545 } | |
| 546 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 547 return unichars[unichar_id].properties.isngram; | |
| 548 } | |
| 549 | |
| 550 // Returns whether the unichar id represents a unicode value in the private | |
| 551 // use area. | |
| 552 bool get_isprivate(UNICHAR_ID unichar_id) const; | |
| 553 | |
| 554 // Returns true if the ids have useful min/max top/bottom values. | |
| 555 bool top_bottom_useful() const { | |
| 556 return top_bottom_set_; | |
| 557 } | |
| 558 // Sets all ranges to empty, so they can be expanded to set the values. | |
| 559 void set_ranges_empty(); | |
| 560 // Sets all the properties for this unicharset given a src_unicharset with | |
| 561 // everything set. The unicharsets don't have to be the same, and graphemes | |
| 562 // are correctly accounted for. | |
| 563 void SetPropertiesFromOther(const UNICHARSET &src) { | |
| 564 PartialSetPropertiesFromOther(0, src); | |
| 565 } | |
| 566 // Sets properties from Other, starting only at the given index. | |
| 567 void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src); | |
| 568 // Expands the tops and bottoms and widths for this unicharset given a | |
| 569 // src_unicharset with ranges in it. The unicharsets don't have to be the | |
| 570 // same, and graphemes are correctly accounted for. | |
| 571 void ExpandRangesFromOther(const UNICHARSET &src); | |
| 572 // Makes this a copy of src. Clears this completely first, so the automatic | |
| 573 // ids will not be present in this if not in src. | |
| 574 void CopyFrom(const UNICHARSET &src); | |
| 575 // For each id in src, if it does not occur in this, add it, as in | |
| 576 // SetPropertiesFromOther, otherwise expand the ranges, as in | |
| 577 // ExpandRangesFromOther. | |
| 578 void AppendOtherUnicharset(const UNICHARSET &src); | |
| 579 // Returns true if the acceptable ranges of the tops of the characters do | |
| 580 // not overlap, making their x-height calculations distinct. | |
| 581 bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const; | |
| 582 // Returns the min and max bottom and top of the given unichar in | |
| 583 // baseline-normalized coordinates, ie, where the baseline is | |
| 584 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight | |
| 585 // (See normalis.h for the definitions). | |
| 586 void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, | |
| 587 int *min_top, int *max_top) const { | |
| 588 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 589 *min_bottom = *min_top = 0; | |
| 590 *max_bottom = *max_top = 256; // kBlnCellHeight | |
| 591 return; | |
| 592 } | |
| 593 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 594 *min_bottom = unichars[unichar_id].properties.min_bottom; | |
| 595 *max_bottom = unichars[unichar_id].properties.max_bottom; | |
| 596 *min_top = unichars[unichar_id].properties.min_top; | |
| 597 *max_top = unichars[unichar_id].properties.max_top; | |
| 598 } | |
| 599 void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, | |
| 600 int min_top, int max_top) { | |
| 601 unichars[unichar_id].properties.min_bottom = | |
| 602 ClipToRange<int>(min_bottom, 0, UINT8_MAX); | |
| 603 unichars[unichar_id].properties.max_bottom = | |
| 604 ClipToRange<int>(max_bottom, 0, UINT8_MAX); | |
| 605 unichars[unichar_id].properties.min_top = | |
| 606 ClipToRange<int>(min_top, 0, UINT8_MAX); | |
| 607 unichars[unichar_id].properties.max_top = | |
| 608 ClipToRange<int>(max_top, 0, UINT8_MAX); | |
| 609 } | |
| 610 // Returns the width stats (as mean, sd) of the given unichar relative to the | |
| 611 // median advance of all characters in the character set. | |
| 612 void get_width_stats(UNICHAR_ID unichar_id, float *width, | |
| 613 float *width_sd) const { | |
| 614 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 615 *width = 0.0f; | |
| 616 *width_sd = 0.0f; | |
| 617 return; | |
| 618 } | |
| 619 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 620 *width = unichars[unichar_id].properties.width; | |
| 621 *width_sd = unichars[unichar_id].properties.width_sd; | |
| 622 } | |
| 623 void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) { | |
| 624 unichars[unichar_id].properties.width = width; | |
| 625 unichars[unichar_id].properties.width_sd = width_sd; | |
| 626 } | |
| 627 // Returns the stats of the x-bearing (as mean, sd) of the given unichar | |
| 628 // relative to the median advance of all characters in the character set. | |
| 629 void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, | |
| 630 float *bearing_sd) const { | |
| 631 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 632 *bearing = *bearing_sd = 0.0f; | |
| 633 return; | |
| 634 } | |
| 635 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 636 *bearing = unichars[unichar_id].properties.bearing; | |
| 637 *bearing_sd = unichars[unichar_id].properties.bearing_sd; | |
| 638 } | |
| 639 void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, | |
| 640 float bearing_sd) { | |
| 641 unichars[unichar_id].properties.bearing = bearing; | |
| 642 unichars[unichar_id].properties.bearing_sd = bearing_sd; | |
| 643 } | |
| 644 // Returns the stats of the x-advance of the given unichar (as mean, sd) | |
| 645 // relative to the median advance of all characters in the character set. | |
| 646 void get_advance_stats(UNICHAR_ID unichar_id, float *advance, | |
| 647 float *advance_sd) const { | |
| 648 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 649 *advance = *advance_sd = 0; | |
| 650 return; | |
| 651 } | |
| 652 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 653 *advance = unichars[unichar_id].properties.advance; | |
| 654 *advance_sd = unichars[unichar_id].properties.advance_sd; | |
| 655 } | |
| 656 void set_advance_stats(UNICHAR_ID unichar_id, float advance, | |
| 657 float advance_sd) { | |
| 658 unichars[unichar_id].properties.advance = advance; | |
| 659 unichars[unichar_id].properties.advance_sd = advance_sd; | |
| 660 } | |
| 661 // Returns true if the font metrics properties are empty. | |
| 662 bool PropertiesIncomplete(UNICHAR_ID unichar_id) const { | |
| 663 return unichars[unichar_id].properties.AnyRangeEmpty(); | |
| 664 } | |
| 665 | |
| 666 // Returns true if the script of the given id is space delimited. | |
| 667 // Returns false for Han, Thai, Hangul, Hiragana and Katakana scripts. | |
| 668 bool IsSpaceDelimited(UNICHAR_ID unichar_id) const { | |
| 669 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 670 return true; | |
| 671 } | |
| 672 int script_id = get_script(unichar_id); | |
| 673 return script_id != han_sid_ && script_id != thai_sid_ && | |
| 674 script_id != hangul_sid_ && script_id != hiragana_sid_ && | |
| 675 script_id != katakana_sid_; | |
| 676 } | |
| 677 | |
| 678 // Return the script id of the given unichar (null_sid_ if the id is | |
| 679 // INVALID_UNICHAR_ID). The value is an integer id, not a pointer, so | |
| 680 // there is nothing for the caller to delete. | |
| 681 int get_script(UNICHAR_ID unichar_id) const { | |
| 682 if (INVALID_UNICHAR_ID == unichar_id) { | |
| 683 return null_sid_; | |
| 684 } | |
| 685 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 686 return unichars[unichar_id].properties.script_id; | |
| 687 } | |
| 688 | |
| 689 // Returns the character properties (e.g. alpha/upper/lower/digit/punct) of | |
| 690 // the given unichar id as a bit field of unsigned int. Declaration only. | |
| 691 unsigned int get_properties(UNICHAR_ID unichar_id) const; | |
| 692 | |
| 693 // Returns the character property of unichar_id as a single char. If a | |
| 694 // character has multiple attributes, the first match in this order wins: | |
| 695 // upper_case : 'A' | |
| 696 // lower_case : 'a' | |
| 697 // alpha : 'x' | |
| 698 // digit : '0' | |
| 699 // punctuation: 'p' | |
| 700 char get_chartype(UNICHAR_ID unichar_id) const; | |
| 701 | |
| 702 // Looks up the other_case property of unichar_id, i.e. the id of the | |
| // corresponding upper/lower case unichar. INVALID_UNICHAR_ID maps to | |
| // INVALID_UNICHAR_ID. | |
| 703 UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const { | |
| 704 if (unichar_id != INVALID_UNICHAR_ID) { | |
| 705 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 706 return unichars[unichar_id].properties.other_case; | |
| 707 } | |
| 708 return INVALID_UNICHAR_ID; | |
| 709 } | |
| 710 | |
| 711 // Gives the direction property stored for the given unichar. | |
| // INVALID_UNICHAR_ID is reported as U_OTHER_NEUTRAL. | |
| 712 Direction get_direction(UNICHAR_ID unichar_id) const { | |
| 713 if (unichar_id != INVALID_UNICHAR_ID) { | |
| 714 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 715 return unichars[unichar_id].properties.direction; | |
| 716 } | |
| 717 return UNICHARSET::U_OTHER_NEUTRAL; | |
| 718 } | |
| 719 | |
| 720 // Looks up the mirror unichar id stored in the properties of unichar_id. | |
| // INVALID_UNICHAR_ID maps to INVALID_UNICHAR_ID. | |
| 721 UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const { | |
| 722 UNICHAR_ID mirrored = INVALID_UNICHAR_ID; | |
| 723 if (unichar_id != INVALID_UNICHAR_ID) { | |
| 724 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 725 mirrored = unichars[unichar_id].properties.mirror; | |
| 726 } | |
| return mirrored; | |
| 727 } | |
| 728 | |
| 729 // Maps unichar_id to its lower-case unichar: an id already flagged islower | |
| // is returned unchanged, any other id yields its other_case property. | |
| // INVALID_UNICHAR_ID maps to INVALID_UNICHAR_ID. | |
| 730 UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const { | |
| 731 if (unichar_id == INVALID_UNICHAR_ID) { | |
| 732 return INVALID_UNICHAR_ID; | |
| 733 } | |
| 734 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 735 const auto &props = unichars[unichar_id].properties; | |
| 736 return props.islower ? unichar_id : props.other_case; | |
| 739 } | |
| 740 | |
| 741 // Maps unichar_id to its upper-case unichar: an id already flagged isupper | |
| // is returned unchanged, any other id yields its other_case property. | |
| // INVALID_UNICHAR_ID maps to INVALID_UNICHAR_ID. | |
| 742 UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const { | |
| 743 if (unichar_id == INVALID_UNICHAR_ID) { | |
| 744 return INVALID_UNICHAR_ID; | |
| 745 } | |
| 746 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 747 const auto &props = unichars[unichar_id].properties; | |
| 748 return props.isupper ? unichar_id : props.other_case; | |
| 751 } | |
| 752 | |
| 753 // Checks that the special codes in SpecialUnicharCodes are available: | |
| 754 // UNICHAR_BROKEN must be a fragment whose text equals its entry in | |
| 755 // kSpecialUnicharCodes. If false then there are normal unichars at these | |
| // codes and they should not be used. | |
| 756 bool has_special_codes() const { | |
| 757 if (get_fragment(UNICHAR_BROKEN) == nullptr) { | |
| return false; | |
| } | |
| 758 return strcmp(id_to_unichar(UNICHAR_BROKEN), | |
| 759 kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0; | |
| 760 } | |
| 761 | |
| 762 // Returns true if the normalized text of any unichar-id in the unicharset | |
| 763 // contains a repeated unicode. Declaration only. | |
| 764 bool AnyRepeatedUnicodes() const; | |
| 765 | |
| 766 // Gives the CHAR_FRAGMENT pointer stored in the properties of unichar_id, | |
| 767 // which describes the fragment if the id represents a character fragment. | |
| // INVALID_UNICHAR_ID yields nullptr. | |
| 768 const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { | |
| 769 const CHAR_FRAGMENT *result = nullptr; | |
| 770 if (unichar_id != INVALID_UNICHAR_ID) { | |
| 771 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| 772 result = unichars[unichar_id].properties.fragment; | |
| 773 } | |
| return result; | |
| 774 } | |
| 775 | |
| 776 // Returns the isalpha property of the unichar that unichar_to_id finds for the string unichar_repr. | |
| 777 bool get_isalpha(const char *const unichar_repr) const { | |
| 778 return get_isalpha(unichar_to_id(unichar_repr)); | |
| 779 } | |
| 780 | |
| 781 // Returns the islower property of the unichar that unichar_to_id finds for the string unichar_repr. | |
| 782 bool get_islower(const char *const unichar_repr) const { | |
| 783 return get_islower(unichar_to_id(unichar_repr)); | |
| 784 } | |
| 785 | |
| 786 // Returns the isupper property of the unichar that unichar_to_id finds for the string unichar_repr. | |
| 787 bool get_isupper(const char *const unichar_repr) const { | |
| 788 return get_isupper(unichar_to_id(unichar_repr)); | |
| 789 } | |
| 790 | |
| 791 // Returns the isdigit property of the unichar that unichar_to_id finds for the string unichar_repr. | |
| 792 bool get_isdigit(const char *const unichar_repr) const { | |
| 793 return get_isdigit(unichar_to_id(unichar_repr)); | |
| 794 } | |
| 795 | |
| 796 // Returns the ispunctuation property of the unichar that unichar_to_id finds for the string unichar_repr. | |
| 797 bool get_ispunctuation(const char *const unichar_repr) const { | |
| 798 return get_ispunctuation(unichar_to_id(unichar_repr)); | |
| 799 } | |
| 800 | |
| 801 // Returns the character properties (e.g. alpha/upper/lower/digit/punct) of | |
| 802 // the unichar that unichar_to_id finds for the string unichar_repr. | |
| 803 unsigned int get_properties(const char *const unichar_repr) const { | |
| 804 return get_properties(unichar_to_id(unichar_repr)); | |
| 805 } | |
| 806 | |
| // Returns the single-char property code of the unichar that unichar_to_id | |
| // finds for the string unichar_repr. | |
| 807 char get_chartype(const char *const unichar_repr) const { | |
| const UNICHAR_ID id = unichar_to_id(unichar_repr); | |
| 808 return get_chartype(id); | |
| 809 } | |
| 810 | |
| 811 // Returns the script id (an int, not a name) of the unichar that | |
| 812 // unichar_to_id finds for the string unichar_repr. The id can be turned | |
| 813 // into the script string with get_script_from_script_id. | |
| 814 int get_script(const char *const unichar_repr) const { | |
| 815 return get_script(unichar_to_id(unichar_repr)); | |
| 816 } | |
| 817 | |
| 818 // Gives the CHAR_FRAGMENT pointer for the unichar with the given string | |
| 819 // representation. A null or empty string, or one that ids does not | |
| // contain, yields nullptr. | |
| 820 const CHAR_FRAGMENT *get_fragment(const char *const unichar_repr) const { | |
| 821 const bool known = unichar_repr != nullptr && unichar_repr[0] != '\0' && | |
| 822 ids.contains(unichar_repr, false); | |
| 825 return known ? get_fragment(unichar_to_id(unichar_repr)) : nullptr; | |
| 826 } | |
| 827 | |
| 828 // Returns the isalpha property of the unichar that unichar_to_id finds for | |
| 829 // unichar_repr, using only its first length characters. | |
| 830 bool get_isalpha(const char *const unichar_repr, int length) const { | |
| 831 return get_isalpha(unichar_to_id(unichar_repr, length)); | |
| 832 } | |
| 833 | |
| 834 // Returns the islower property of the unichar that unichar_to_id finds for | |
| 835 // unichar_repr, using only its first length characters. | |
| 836 bool get_islower(const char *const unichar_repr, int length) const { | |
| 837 return get_islower(unichar_to_id(unichar_repr, length)); | |
| 838 } | |
| 839 | |
| 840 // Returns the isupper property of the unichar that unichar_to_id finds for | |
| 841 // unichar_repr, using only its first length characters. | |
| 842 bool get_isupper(const char *const unichar_repr, int length) const { | |
| 843 return get_isupper(unichar_to_id(unichar_repr, length)); | |
| 844 } | |
| 845 | |
| 846 // Returns the isdigit property of the unichar that unichar_to_id finds for | |
| 847 // unichar_repr, using only its first length characters. | |
| 848 bool get_isdigit(const char *const unichar_repr, int length) const { | |
| 849 return get_isdigit(unichar_to_id(unichar_repr, length)); | |
| 850 } | |
| 851 | |
| 852 // Returns the ispunctuation property of the unichar that unichar_to_id | |
| 853 // finds for unichar_repr, using only its first length characters. | |
| 854 bool get_ispunctuation(const char *const unichar_repr, int length) const { | |
| 855 return get_ispunctuation(unichar_to_id(unichar_repr, length)); | |
| 856 } | |
| 857 | |
| 858 // Gives the normalized version of the unichar with the given unichar_id. | |
| // UNICHAR_SPACE always yields " "; any other id is used as an index | |
| // without validation. | |
| 859 const char *get_normed_unichar(UNICHAR_ID unichar_id) const { | |
| 860 return unichar_id == UNICHAR_SPACE | |
| 861 ? " " | |
| 863 : unichars[unichar_id].properties.normed.c_str(); | |
| 864 } | |
| 865 // Gives the UNICHAR_IDs that make up the normalized version of the given | |
| 866 // id; the vector may hold more than one id if unichar_id represents a | |
| 867 // ligature. The id is used as an index without validation. | |
| 868 const std::vector<UNICHAR_ID> &normed_ids(UNICHAR_ID unichar_id) const { | |
| const auto &props = unichars[unichar_id].properties; | |
| 869 return props.normed_ids; | |
| 870 } | |
| 871 | |
| 872 // Returns the script id (an int, not a name) of the unichar that | |
| 873 // unichar_to_id finds for unichar_repr. | |
| 874 // Only the first length characters from unichar_repr are used. | |
| 875 // The id can be turned into the script string with get_script_from_script_id. | |
| 876 int get_script(const char *const unichar_repr, int length) const { | |
| 877 return get_script(unichar_to_id(unichar_repr, length)); | |
| 878 } | |
| 879 | |
| 880 // Returns script_table_size_used, the (current) number of scripts in the script table. | |
| 881 int get_script_table_size() const { | |
| 882 return script_table_size_used; | |
| 883 } | |
| 884 | |
| 885 // Gives the script string stored in the script table for id, or | |
| // null_script when id lies outside [0, script_table_size_used). | |
| 886 const char *get_script_from_script_id(int id) const { | |
| 887 const bool in_table = id >= 0 && id < script_table_size_used; | |
| 890 return in_table ? script_table[id] : null_script; | |
| 891 } | |
| 892 | |
| 893 // Returns the id from the name of the script, or 0 if script is not found. | |
| 894 // Note that this is an expensive operation since it involves iteratively | |
| 895 // comparing strings in the script table; no hash is used (originally to | |
| 896 // avoid a dependency on STL). Callers should look up the ids of relevant | |
| 897 // scripts once and save them for fast comparisons later. | |
| 898 int get_script_id_from_name(const char *script_name) const; | |
| 899 | |
| 900 // Returns true if script is the null script. This is a pointer comparison against null_script, not a string comparison. | |
| 901 bool is_null_script(const char *script) const { | |
| 902 return script == null_script; | |
| 903 } | |
| 904 | |
| 905 // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0, | |
| 906 // then the returned value (an int, presumably the script id) will be the same. | |
| 907 // The script parameter is copied and thus can be a temporary. | |
| 908 int add_script(const char *script); | |
| 909 | |
| 910 // Gives the enabled property of the given unichar. The id must be | |
| // contained in the set (asserted); INVALID_UNICHAR_ID is not special-cased. | |
| 911 bool get_enabled(UNICHAR_ID unichar_id) const { | |
| 912 ASSERT_HOST(contains_unichar_id(unichar_id)); | |
| const auto &props = unichars[unichar_id].properties; | |
| 913 return props.enabled; | |
| 914 } | |
| 915 | |
| // Accessor for null_sid_, the script id that get_script reports for INVALID_UNICHAR_ID. | |
| 916 int null_sid() const { return null_sid_; } | |
| // Accessor for common_sid_, one of the cached script name-to-id mappings. | |
| 919 int common_sid() const { return common_sid_; } | |
| // Accessor for latin_sid_, one of the cached script name-to-id mappings. | |
| 922 int latin_sid() const { return latin_sid_; } | |
| // Accessor for cyrillic_sid_, one of the cached script name-to-id mappings. | |
| 925 int cyrillic_sid() const { return cyrillic_sid_; } | |
| // Accessor for greek_sid_, one of the cached script name-to-id mappings. | |
| 928 int greek_sid() const { return greek_sid_; } | |
| // Accessor for han_sid_, one of the ids IsSpaceDelimited treats as not space delimited. | |
| 931 int han_sid() const { return han_sid_; } | |
| // Accessor for hiragana_sid_, one of the ids IsSpaceDelimited treats as not space delimited. | |
| 934 int hiragana_sid() const { return hiragana_sid_; } | |
| // Accessor for katakana_sid_, one of the ids IsSpaceDelimited treats as not space delimited. | |
| 937 int katakana_sid() const { return katakana_sid_; } | |
| // Accessor for thai_sid_, one of the ids IsSpaceDelimited treats as not space delimited. | |
| 940 int thai_sid() const { return thai_sid_; } | |
| // Accessor for hangul_sid_, one of the ids IsSpaceDelimited treats as not space delimited. | |
| 943 int hangul_sid() const { return hangul_sid_; } | |
| // Accessor for default_sid_, the most frequently occurring script in the charset. | |
| 946 int default_sid() const { return default_sid_; } | |
| 949 | |
| 950 // Returns script_has_upper_lower_: true if the unicharset has the concept of upper/lower case. | |
| 951 bool script_has_upper_lower() const { | |
| 952 return script_has_upper_lower_; | |
| 953 } | |
| 954 // Returns script_has_xheight_: true if the unicharset has the concept of | |
| 955 // x-height. This can be true even if script_has_upper_lower is not, | |
| 956 // when the script has a sufficiently predominant top line with ascenders, | |
| 957 // such as Devanagari and Thai. | |
| 958 bool script_has_xheight() const { | |
| 959 return script_has_xheight_; | |
| 960 } | |
| 961 | |
| 962 private: | |
| 963 struct TESS_API UNICHAR_PROPERTIES { | |
| 964 UNICHAR_PROPERTIES(); | |
| 965 // Initializes all properties to sensible default values. | |
| 966 void Init(); | |
| 967 // Sets all ranges wide open. This is the initialization default in case | |
| 968 // there are no useful values available. | |
| 969 void SetRangesOpen(); | |
| 970 // Sets all ranges to empty. Used before expanding with font-based data. | |
| 971 void SetRangesEmpty(); | |
| 972 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats | |
| 973 // is empty. | |
| 974 bool AnyRangeEmpty() const; | |
| 975 // Expands the ranges of this with the ranges from the src properties. | |
| 976 void ExpandRangesFrom(const UNICHAR_PROPERTIES &src); | |
| 977 // Copies the properties from src into this. | |
| 978 void CopyFrom(const UNICHAR_PROPERTIES &src); | |
| 979 | |
| 980 bool isalpha; | |
| 981 bool islower; | |
| 982 bool isupper; | |
| 983 bool isdigit; | |
| 984 bool ispunctuation; | |
| 985 bool isngram; | |
| 986 bool enabled; | |
| 987 // Possible limits of the top and bottom of the bounding box in | |
| 988 // baseline-normalized coordinates, i.e. where the baseline is | |
| 989 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight | |
| 990 // (see normalis.h for the definitions). | |
| 991 uint8_t min_bottom; | |
| 992 uint8_t max_bottom; | |
| 993 uint8_t min_top; | |
| 994 uint8_t max_top; | |
| 995 // Statistics of the width of the bounding box, relative to the median advance. | |
| 996 float width; | |
| 997 float width_sd; | |
| 998 // Stats of the x-bearing and advance, also relative to the median advance. | |
| 999 float bearing; | |
| 1000 float bearing_sd; | |
| 1001 float advance; | |
| 1002 float advance_sd; | |
| 1003 int script_id; | |
| 1004 UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar | |
| 1005 Direction direction; // direction of this unichar | |
| 1006 // Mirror property is useful for reverse DAWG lookup for words in | |
| 1007 // right-to-left languages. E.g. "(word)" would be | |
| 1008 // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string. | |
| 1009 // However, what we want in our DAWG is | |
| 1010 // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not | |
| 1011 // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'. | |
| 1012 UNICHAR_ID mirror; | |
| 1013 // The unichar_ids that represent the corresponding normed string. | |
| 1014 // For awkward characters like em-dash, this gives hyphen. | |
| 1015 // For ligatures, this gives the string of normal unichars. | |
| 1016 std::vector<UNICHAR_ID> normed_ids; | |
| 1017 std::string normed; // normalized version of this unichar | |
| 1018 // Contains meta information about the fragment if a unichar represents | |
| 1019 // a fragment of a character, otherwise should be set to nullptr. | |
| 1020 // It is assumed that character fragments are added to the unicharset | |
| 1021 // after the corresponding 'base' characters. | |
| 1022 CHAR_FRAGMENT *fragment; | |
| 1023 }; | |
| 1024 | |
| 1025 struct UNICHAR_SLOT { | |
| 1026 char representation[UNICHAR_LEN + 1]; | |
| 1027 UNICHAR_PROPERTIES properties; | |
| 1028 }; | |
| 1029 | |
| 1030 // Internal recursive version of the public encode_string. | |
| 1031 // str is the start of the whole string. | |
| 1032 // str_index is the current position in str. | |
| 1033 // str_length is the length of str. | |
| 1034 // encoding is a working encoding of str. | |
| 1035 // lengths is a working set of lengths of each element of encoding. | |
| 1036 // best_total_length is the longest length of str that has been successfully | |
| 1037 // encoded so far. | |
| 1038 // On return: | |
| 1039 // best_encoding contains the encoding that used the longest part of str. | |
| 1040 // best_lengths (may be null) contains the lengths of best_encoding. | |
| 1041 void encode_string(const char *str, int str_index, int str_length, | |
| 1042 std::vector<UNICHAR_ID> *encoding, | |
| 1043 std::vector<char> *lengths, unsigned *best_total_length, | |
| 1044 std::vector<UNICHAR_ID> *best_encoding, | |
| 1045 std::vector<char> *best_lengths) const; | |
| 1046 | |
| 1047 // Gets the properties for a grapheme string into props, combining the | |
| 1048 // properties of multiple characters in a meaningful way where possible. | |
| 1049 // Returns false if no valid match was found in the unicharset. | |
| 1050 // NOTE that script_id, mirror, and other_case refer to this unicharset on | |
| 1051 // return and will need redirecting if the target unicharset is different. | |
| 1052 bool GetStrProperties(const char *utf8_str, UNICHAR_PROPERTIES *props) const; | |
| 1053 | |
| 1054 // Loads this unicharset from a "file" where our only interface to the file | |
| 1055 // is fgets_cb, an implementation of fgets(). This is the parsing primitive | |
| 1056 // used by the public load_from_file() routines. | |
| 1057 bool load_via_fgets(const std::function<char *(char *, int)> &fgets_cb, | |
| 1058 bool skip_fragments); | |
| 1059 | |
| 1060 // List of mappings to make when ingesting strings from the outside. | |
| 1061 // The substitutions clean up text that should exist for rendering of | |
| 1062 // synthetic data, but not in the recognition set. | |
| 1063 static const char *kCleanupMaps[][2]; | |
| 1064 static const char *null_script; | |
| 1065 | |
| 1066 std::vector<UNICHAR_SLOT> unichars; | |
| 1067 UNICHARMAP ids; | |
| 1068 char **script_table; | |
| 1069 int script_table_size_used; | |
| 1070 int script_table_size_reserved; | |
| 1071 // True if the unichars have their tops/bottoms set. | |
| 1072 bool top_bottom_set_; | |
| 1073 // True if the unicharset has significant upper/lower case chars. | |
| 1074 bool script_has_upper_lower_; | |
| 1075 // True if the unicharset has a significant mean-line with significant | |
| 1076 // ascenders above that. | |
| 1077 bool script_has_xheight_; | |
| 1078 // True if the set contains chars that would be changed by the cleanup (presumably the kCleanupMaps substitutions). | |
| 1079 bool old_style_included_; | |
| 1080 | |
| 1081 // A few convenient script name-to-id mappings, kept without using a hash. | |
| 1082 // These are initialized when the unicharset file is loaded. Anything | |
| 1083 // missing from this list can be looked up using get_script_id_from_name. | |
| 1084 int null_sid_; | |
| 1085 int common_sid_; | |
| 1086 int latin_sid_; | |
| 1087 int cyrillic_sid_; | |
| 1088 int greek_sid_; | |
| 1089 int han_sid_; | |
| 1090 int hiragana_sid_; | |
| 1091 int katakana_sid_; | |
| 1092 int thai_sid_; | |
| 1093 int hangul_sid_; | |
| 1094 // The id of the most frequently occurring script in the charset. | |
| 1095 int default_sid_; | |
| 1096 }; | |
| 1097 | |
| 1098 } // namespace tesseract | |
| 1099 | |
| 1100 #endif // TESSERACT_CCUTIL_UNICHARSET_H_ |
