Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccutil/unicharcompress.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: unicharcompress.cpp | |
| 3 // Description: Unicode re-encoding using a sequence of smaller numbers in | |
| 4 // place of a single large code for CJK, similarly for Indic, | |
| 5 // and dissection of ligatures for other scripts. | |
| 6 // Author: Ray Smith | |
| 7 // | |
| 8 // (C) Copyright 2015, Google Inc. | |
| 9 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 10 // you may not use this file except in compliance with the License. | |
| 11 // You may obtain a copy of the License at | |
| 12 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 13 // Unless required by applicable law or agreed to in writing, software | |
| 14 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 16 // See the License for the specific language governing permissions and | |
| 17 // limitations under the License. | |
| 18 // | |
| 19 /////////////////////////////////////////////////////////////////////// | |
| 20 | |
| 21 #include "unicharcompress.h" | |
| 22 #include <algorithm> | |
| 23 #include <memory> | |
| 24 #include "tprintf.h" | |
| 25 | |
| 26 namespace tesseract { | |
| 27 | |
| 28 // String used to represent the null_id in direct_set. | |
| 29 static const char *kNullChar = "<nul>"; | |
| 30 // Radix to make unique values from the stored radical codes. | |
| 31 const int kRadicalRadix = 29; | |
| 32 | |
| 33 // "Hash" function for const std::vector<int> computes the sum of elements. | |
| 34 // Build a unique number for each code sequence that we can use as the index in | |
| 35 // a hash map of ints instead of trying to hash the vectors. | |
| 36 static int RadicalPreHash(const std::vector<int> &rs) { | |
| 37 size_t result = 0; | |
| 38 for (int radical : rs) { | |
| 39 result *= kRadicalRadix; | |
| 40 result += radical; | |
| 41 } | |
| 42 return result; | |
| 43 } | |
| 44 | |
| 45 // A hash map to convert unicodes to radical encoding. | |
| 46 using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>; | |
| 47 // A hash map to count occurrences of each radical encoding. | |
| 48 using RSCounts = std::unordered_map<int, int>; | |
| 49 | |
| 50 static bool DecodeRadicalLine(std::string &radical_data_line, RSMap *radical_map) { | |
| 51 if (radical_data_line.empty() || (radical_data_line)[0] == '#') { | |
| 52 return true; | |
| 53 } | |
| 54 std::vector<std::string> entries = split(radical_data_line, ' '); | |
| 55 if (entries.size() < 2) { | |
| 56 return false; | |
| 57 } | |
| 58 char *end = nullptr; | |
| 59 int unicode = strtol(&entries[0][0], &end, 10); | |
| 60 if (*end != '\0') { | |
| 61 return false; | |
| 62 } | |
| 63 std::unique_ptr<std::vector<int>> radicals(new std::vector<int>); | |
| 64 for (size_t i = 1; i < entries.size(); ++i) { | |
| 65 int radical = strtol(&entries[i][0], &end, 10); | |
| 66 if (*end != '\0') { | |
| 67 return false; | |
| 68 } | |
| 69 radicals->push_back(radical); | |
| 70 } | |
| 71 (*radical_map)[unicode] = std::move(radicals); | |
| 72 return true; | |
| 73 } | |
| 74 | |
| 75 // Helper function builds the RSMap from the radical-stroke file, which has | |
| 76 // already been read into a string. Returns false on error. | |
| 77 // The radical_stroke_table is non-const because it gets split and the caller | |
| 78 // is unlikely to want to use it again. | |
| 79 static bool DecodeRadicalTable(std::string &radical_data, RSMap *radical_map) { | |
| 80 std::vector<std::string> lines = split(radical_data, '\n'); | |
| 81 for (unsigned i = 0; i < lines.size(); ++i) { | |
| 82 if (!DecodeRadicalLine(lines[i], radical_map)) { | |
| 83 tprintf("Invalid format in radical table at line %d: %s\n", i, lines[i].c_str()); | |
| 84 return false; | |
| 85 } | |
| 86 } | |
| 87 return true; | |
| 88 } | |
| 89 | |
| 90 UnicharCompress::UnicharCompress() : code_range_(0) {} | |
| 91 UnicharCompress::UnicharCompress(const UnicharCompress &src) { | |
| 92 *this = src; | |
| 93 } | |
| 94 UnicharCompress::~UnicharCompress() { | |
| 95 Cleanup(); | |
| 96 } | |
| 97 UnicharCompress &UnicharCompress::operator=(const UnicharCompress &src) { | |
| 98 Cleanup(); | |
| 99 encoder_ = src.encoder_; | |
| 100 code_range_ = src.code_range_; | |
| 101 SetupDecoder(); | |
| 102 return *this; | |
| 103 } | |
| 104 | |
| 105 // Computes the encoding for the given unicharset. It is a requirement that | |
| 106 // the file training/langdata/radical-stroke.txt have been read into the | |
| 107 // input string radical_stroke_table. | |
| 108 // Returns false if the encoding cannot be constructed. | |
| 109 bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id, | |
| 110 std::string *radical_stroke_table) { | |
| 111 RSMap radical_map; | |
| 112 if (radical_stroke_table != nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map)) { | |
| 113 return false; | |
| 114 } | |
| 115 encoder_.clear(); | |
| 116 UNICHARSET direct_set; | |
| 117 // To avoid unused codes, clear the special codes from the direct_set. | |
| 118 direct_set.clear(); | |
| 119 // Always keep space as 0; | |
| 120 direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue); | |
| 121 // Null char is next if we have one. | |
| 122 if (null_id >= 0) { | |
| 123 direct_set.unichar_insert(kNullChar); | |
| 124 } | |
| 125 RSCounts radical_counts; | |
| 126 // In the initial map, codes [0, unicharset.size()) are | |
| 127 // reserved for non-han/hangul sequences of 1 or more unicodes. | |
| 128 int hangul_offset = unicharset.size(); | |
| 129 // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos). | |
| 130 const int kTotalJamos = kLCount + kVCount + kTCount; | |
| 131 // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard | |
| 132 // to measure the number of radicals and strokes, initially we use the same | |
| 133 // code range for all 3 Han code positions, and fix them after. | |
| 134 int han_offset = hangul_offset + kTotalJamos; | |
| 135 for (unsigned u = 0; u <= unicharset.size(); ++u) { | |
| 136 // We special-case allow null_id to be equal to unicharset.size() in case | |
| 137 // there is no space in unicharset for it. | |
| 138 if (u == unicharset.size() && static_cast<int>(u) != null_id) { | |
| 139 break; // Finished | |
| 140 } | |
| 141 RecodedCharID code; | |
| 142 // Convert to unicodes. | |
| 143 std::vector<char32> unicodes; | |
| 144 std::string cleaned; | |
| 145 if (u < unicharset.size()) { | |
| 146 cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u)); | |
| 147 } | |
| 148 if (u < unicharset.size() && (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) { | |
| 149 // Check single unicodes for Hangul/Han and encode if so. | |
| 150 int unicode = unicodes[0]; | |
| 151 int leading, vowel, trailing; | |
| 152 auto it = radical_map.find(unicode); | |
| 153 if (it != radical_map.end()) { | |
| 154 // This is Han. Use the radical codes directly. | |
| 155 int num_radicals = it->second->size(); | |
| 156 for (int c = 0; c < num_radicals; ++c) { | |
| 157 code.Set(c, han_offset + (*it->second)[c]); | |
| 158 } | |
| 159 int pre_hash = RadicalPreHash(*it->second); | |
| 160 int num_samples = radical_counts[pre_hash]++; | |
| 161 if (num_samples > 0) { | |
| 162 code.Set(num_radicals, han_offset + num_samples + kRadicalRadix); | |
| 163 } | |
| 164 } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) { | |
| 165 // This is Hangul. Since we know the exact size of each part at compile | |
| 166 // time, it gets the bottom set of codes. | |
| 167 code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset, | |
| 168 trailing + kLCount + kVCount + hangul_offset); | |
| 169 } | |
| 170 } | |
| 171 // If the code is still empty, it wasn't Han or Hangul. | |
| 172 if (code.empty()) { | |
| 173 // Special cases. | |
| 174 if (u == UNICHAR_SPACE) { | |
| 175 code.Set(0, 0); // Space. | |
| 176 } else if (static_cast<int>(u) == null_id || | |
| 177 (unicharset.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT)) { | |
| 178 code.Set(0, direct_set.unichar_to_id(kNullChar)); | |
| 179 } else { | |
| 180 // Add the direct_set unichar-ids of the unicodes in sequence to the | |
| 181 // code. | |
| 182 for (int uni : unicodes) { | |
| 183 int position = code.length(); | |
| 184 if (position >= RecodedCharID::kMaxCodeLen) { | |
| 185 tprintf("Unichar %d=%s is too long to encode!!\n", u, unicharset.id_to_unichar(u)); | |
| 186 return false; | |
| 187 } | |
| 188 UNICHAR unichar(uni); | |
| 189 char *utf8 = unichar.utf8_str(); | |
| 190 if (!direct_set.contains_unichar(utf8)) { | |
| 191 direct_set.unichar_insert(utf8); | |
| 192 } | |
| 193 code.Set(position, direct_set.unichar_to_id(utf8)); | |
| 194 delete[] utf8; | |
| 195 if (direct_set.size() > unicharset.size() + !unicharset.has_special_codes()) { | |
| 196 // Code space got bigger! | |
| 197 tprintf("Code space expanded from original unicharset!!\n"); | |
| 198 return false; | |
| 199 } | |
| 200 } | |
| 201 } | |
| 202 } | |
| 203 encoder_.push_back(code); | |
| 204 } | |
| 205 // Now renumber Han to make all codes unique. We already added han_offset to | |
| 206 // all Han. Now separate out the radical, stroke, and count codes for Han. | |
| 207 int code_offset = 0; | |
| 208 for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) { | |
| 209 int max_offset = 0; | |
| 210 for (unsigned u = 0; u < unicharset.size(); ++u) { | |
| 211 RecodedCharID *code = &encoder_[u]; | |
| 212 if (code->length() <= i) { | |
| 213 continue; | |
| 214 } | |
| 215 max_offset = std::max(max_offset, (*code)(i)-han_offset); | |
| 216 code->Set(i, (*code)(i) + code_offset); | |
| 217 } | |
| 218 if (max_offset == 0) { | |
| 219 break; | |
| 220 } | |
| 221 code_offset += max_offset + 1; | |
| 222 } | |
| 223 DefragmentCodeValues(null_id >= 0 ? 1 : -1); | |
| 224 SetupDecoder(); | |
| 225 return true; | |
| 226 } | |
| 227 | |
| 228 // Sets up an encoder that doesn't change the unichars at all, so it just | |
| 229 // passes them through unchanged. | |
| 230 void UnicharCompress::SetupPassThrough(const UNICHARSET &unicharset) { | |
| 231 std::vector<RecodedCharID> codes; | |
| 232 for (unsigned u = 0; u < unicharset.size(); ++u) { | |
| 233 RecodedCharID code; | |
| 234 code.Set(0, u); | |
| 235 codes.push_back(code); | |
| 236 } | |
| 237 if (!unicharset.has_special_codes()) { | |
| 238 RecodedCharID code; | |
| 239 code.Set(0, unicharset.size()); | |
| 240 codes.push_back(code); | |
| 241 } | |
| 242 SetupDirect(codes); | |
| 243 } | |
| 244 | |
| 245 // Sets up an encoder directly using the given encoding vector, which maps | |
| 246 // unichar_ids to the given codes. | |
| 247 void UnicharCompress::SetupDirect(const std::vector<RecodedCharID> &codes) { | |
| 248 encoder_ = codes; | |
| 249 ComputeCodeRange(); | |
| 250 SetupDecoder(); | |
| 251 } | |
| 252 | |
| 253 // Renumbers codes to eliminate unused values. | |
| 254 void UnicharCompress::DefragmentCodeValues(int encoded_null) { | |
| 255 // There may not be any Hangul, but even if there is, it is possible that not | |
| 256 // all codes are used. Likewise with the Han encoding, it is possible that not | |
| 257 // all numbers of strokes are used. | |
| 258 ComputeCodeRange(); | |
| 259 std::vector<int> offsets(code_range_); | |
| 260 // Find which codes are used | |
| 261 for (auto &code : encoder_) { | |
| 262 for (int i = 0; i < code.length(); ++i) { | |
| 263 offsets[code(i)] = 1; | |
| 264 } | |
| 265 } | |
| 266 // Compute offsets based on code use. | |
| 267 int offset = 0; | |
| 268 for (unsigned i = 0; i < offsets.size(); ++i) { | |
| 269 // If not used, decrement everything above here. | |
| 270 // We are moving encoded_null to the end, so it is not "used". | |
| 271 if (offsets[i] == 0 || i == static_cast<unsigned>(encoded_null)) { | |
| 272 --offset; | |
| 273 } else { | |
| 274 offsets[i] = offset; | |
| 275 } | |
| 276 } | |
| 277 if (encoded_null >= 0) { | |
| 278 // The encoded_null is moving to the end, for the benefit of TensorFlow, | |
| 279 // which is offsets.size() + offsets.back(). | |
| 280 offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null; | |
| 281 } | |
| 282 // Now apply the offsets. | |
| 283 for (auto &c : encoder_) { | |
| 284 RecodedCharID *code = &c; | |
| 285 for (int i = 0; i < code->length(); ++i) { | |
| 286 int value = (*code)(i); | |
| 287 code->Set(i, value + offsets[value]); | |
| 288 } | |
| 289 } | |
| 290 ComputeCodeRange(); | |
| 291 } | |
| 292 | |
| 293 // Encodes a single unichar_id. Returns the length of the code, or zero if | |
| 294 // invalid input, and the encoding itself | |
| 295 int UnicharCompress::EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const { | |
| 296 if (unichar_id >= encoder_.size()) { | |
| 297 return 0; | |
| 298 } | |
| 299 *code = encoder_[unichar_id]; | |
| 300 return code->length(); | |
| 301 } | |
| 302 | |
| 303 // Decodes code, returning the original unichar-id, or | |
| 304 // INVALID_UNICHAR_ID if the input is invalid. | |
| 305 int UnicharCompress::DecodeUnichar(const RecodedCharID &code) const { | |
| 306 int len = code.length(); | |
| 307 if (len <= 0 || len > RecodedCharID::kMaxCodeLen) { | |
| 308 return INVALID_UNICHAR_ID; | |
| 309 } | |
| 310 auto it = decoder_.find(code); | |
| 311 if (it == decoder_.end()) { | |
| 312 return INVALID_UNICHAR_ID; | |
| 313 } | |
| 314 return it->second; | |
| 315 } | |
| 316 | |
| 317 // Writes to the given file. Returns false in case of error. | |
| 318 bool UnicharCompress::Serialize(TFile *fp) const { | |
| 319 return fp->Serialize(encoder_); | |
| 320 } | |
| 321 | |
| 322 // Reads from the given file. Returns false in case of error. | |
| 323 bool UnicharCompress::DeSerialize(TFile *fp) { | |
| 324 if (!fp->DeSerialize(encoder_)) { | |
| 325 return false; | |
| 326 } | |
| 327 ComputeCodeRange(); | |
| 328 SetupDecoder(); | |
| 329 return true; | |
| 330 } | |
| 331 | |
| 332 // Returns a string containing a text file that describes the encoding thus: | |
| 333 // <index>[,<index>]*<tab><UTF8-str><newline> | |
| 334 // In words, a comma-separated list of one or more indices, followed by a tab | |
| 335 // and the UTF-8 string that the code represents per line. Most simple scripts | |
| 336 // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean | |
| 337 // and the Indic scripts will contain a many-to-many mapping. | |
| 338 // See the class comment above for details. | |
| 339 std::string UnicharCompress::GetEncodingAsString(const UNICHARSET &unicharset) const { | |
| 340 std::string encoding; | |
| 341 for (unsigned c = 0; c < encoder_.size(); ++c) { | |
| 342 const RecodedCharID &code = encoder_[c]; | |
| 343 if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) { | |
| 344 // Don't show the duplicate entry. | |
| 345 continue; | |
| 346 } | |
| 347 encoding += std::to_string(code(0)); | |
| 348 for (int i = 1; i < code.length(); ++i) { | |
| 349 encoding += "," + std::to_string(code(i)); | |
| 350 } | |
| 351 encoding += "\t"; | |
| 352 if (c >= unicharset.size() || | |
| 353 (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && unicharset.has_special_codes())) { | |
| 354 encoding += kNullChar; | |
| 355 } else { | |
| 356 encoding += unicharset.id_to_unichar(c); | |
| 357 } | |
| 358 encoding += "\n"; | |
| 359 } | |
| 360 return encoding; | |
| 361 } | |
| 362 | |
| 363 // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing. | |
| 364 // Note that the returned values are 0-based indices, NOT unicode Jamo. | |
| 365 // Returns false if the input is not in the Hangul unicode range. | |
| 366 /* static */ | |
| 367 bool UnicharCompress::DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing) { | |
| 368 if (unicode < kFirstHangul) { | |
| 369 return false; | |
| 370 } | |
| 371 int offset = unicode - kFirstHangul; | |
| 372 if (offset >= kNumHangul) { | |
| 373 return false; | |
| 374 } | |
| 375 const int kNCount = kVCount * kTCount; | |
| 376 *leading = offset / kNCount; | |
| 377 *vowel = (offset % kNCount) / kTCount; | |
| 378 *trailing = offset % kTCount; | |
| 379 return true; | |
| 380 } | |
| 381 | |
| 382 // Computes the value of code_range_ from the encoder_. | |
| 383 void UnicharCompress::ComputeCodeRange() { | |
| 384 code_range_ = -1; | |
| 385 for (auto &code : encoder_) { | |
| 386 for (int i = 0; i < code.length(); ++i) { | |
| 387 if (code(i) > code_range_) { | |
| 388 code_range_ = code(i); | |
| 389 } | |
| 390 } | |
| 391 } | |
| 392 ++code_range_; | |
| 393 } | |
| 394 | |
| 395 // Initializes the decoding hash_map from the encoding array. | |
| 396 void UnicharCompress::SetupDecoder() { | |
| 397 Cleanup(); | |
| 398 is_valid_start_.clear(); | |
| 399 is_valid_start_.resize(code_range_); | |
| 400 for (unsigned c = 0; c < encoder_.size(); ++c) { | |
| 401 const RecodedCharID &code = encoder_[c]; | |
| 402 decoder_[code] = c; | |
| 403 is_valid_start_[code(0)] = true; | |
| 404 RecodedCharID prefix = code; | |
| 405 int len = code.length() - 1; | |
| 406 prefix.Truncate(len); | |
| 407 auto final_it = final_codes_.find(prefix); | |
| 408 if (final_it == final_codes_.end()) { | |
| 409 auto *code_list = new std::vector<int>; | |
| 410 code_list->push_back(code(len)); | |
| 411 final_codes_[prefix] = code_list; | |
| 412 while (--len >= 0) { | |
| 413 prefix.Truncate(len); | |
| 414 auto next_it = next_codes_.find(prefix); | |
| 415 if (next_it == next_codes_.end()) { | |
| 416 auto *code_list = new std::vector<int>; | |
| 417 code_list->push_back(code(len)); | |
| 418 next_codes_[prefix] = code_list; | |
| 419 } else { | |
| 420 // We still have to search the list as we may get here via multiple | |
| 421 // lengths of code. | |
| 422 if (!contains(*next_it->second, code(len))) { | |
| 423 next_it->second->push_back(code(len)); | |
| 424 } | |
| 425 break; // This prefix has been processed. | |
| 426 } | |
| 427 } | |
| 428 } else { | |
| 429 if (!contains(*final_it->second, code(len))) { | |
| 430 final_it->second->push_back(code(len)); | |
| 431 } | |
| 432 } | |
| 433 } | |
| 434 } | |
| 435 | |
| 436 // Frees allocated memory. | |
| 437 void UnicharCompress::Cleanup() { | |
| 438 decoder_.clear(); | |
| 439 is_valid_start_.clear(); | |
| 440 for (auto &next_code : next_codes_) { | |
| 441 delete next_code.second; | |
| 442 } | |
| 443 for (auto &final_code : final_codes_) { | |
| 444 delete final_code.second; | |
| 445 } | |
| 446 next_codes_.clear(); | |
| 447 final_codes_.clear(); | |
| 448 } | |
| 449 | |
| 450 } // namespace tesseract. |
