Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccutil/unicharcompress.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: unicharcompress.h | |
| 3 // Description: Unicode re-encoding using a sequence of smaller numbers in | |
| 4 // place of a single large code for CJK, similarly for Indic, | |
| 5 // and dissection of ligatures for other scripts. | |
| 6 // Author: Ray Smith | |
| 7 // | |
| 8 // (C) Copyright 2015, Google Inc. | |
| 9 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 10 // you may not use this file except in compliance with the License. | |
| 11 // You may obtain a copy of the License at | |
| 12 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 13 // Unless required by applicable law or agreed to in writing, software | |
| 14 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 16 // See the License for the specific language governing permissions and | |
| 17 // limitations under the License. | |
| 18 // | |
| 19 /////////////////////////////////////////////////////////////////////// | |
| 20 | |
| 21 #ifndef TESSERACT_CCUTIL_UNICHARCOMPRESS_H_ | |
| 22 #define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_ | |
| 23 | |
| 24 #include <unordered_map> | |
| 25 #include <vector> | |
| 26 #include "serialis.h" | |
| 27 #include "unicharset.h" | |
| 28 | |
| 29 namespace tesseract { | |
| 30 | |
| 31 // Trivial class to hold the code for a recoded unichar-id. | |
| 32 class RecodedCharID { | |
| 33 public: | |
| 34 // The maximum length of a code. | |
| 35 static const int kMaxCodeLen = 9; | |
| 36 | |
| 37 RecodedCharID() : self_normalized_(1), length_(0) { | |
| 38 memset(code_, 0, sizeof(code_)); | |
| 39 } | |
| 40 void Truncate(int length) { | |
| 41 length_ = length; | |
| 42 } | |
| 43 // Sets the code value at the given index in the code. | |
| 44 void Set(int index, int value) { | |
| 45 code_[index] = value; | |
| 46 if (length_ <= index) { | |
| 47 length_ = index + 1; | |
| 48 } | |
| 49 } | |
| 50 // Shorthand for setting codes of length 3, as all Hangul and Han codes are | |
| 51 // length 3. | |
| 52 void Set3(int code0, int code1, int code2) { | |
| 53 length_ = 3; | |
| 54 code_[0] = code0; | |
| 55 code_[1] = code1; | |
| 56 code_[2] = code2; | |
| 57 } | |
| 58 bool empty() const { | |
| 59 return length_ == 0; | |
| 60 } | |
| 61 // Accessors | |
| 62 int length() const { | |
| 63 return length_; | |
| 64 } | |
| 65 int operator()(int index) const { | |
| 66 return code_[index]; | |
| 67 } | |
| 68 | |
| 69 // Writes to the given file. Returns false in case of error. | |
| 70 bool Serialize(TFile *fp) const { | |
| 71 return fp->Serialize(&self_normalized_) && fp->Serialize(&length_) && | |
| 72 fp->Serialize(&code_[0], length_); | |
| 73 } | |
| 74 // Reads from the given file. Returns false in case of error. | |
| 75 bool DeSerialize(TFile *fp) { | |
| 76 return fp->DeSerialize(&self_normalized_) && fp->DeSerialize(&length_) && | |
| 77 fp->DeSerialize(&code_[0], length_); | |
| 78 } | |
| 79 bool operator==(const RecodedCharID &other) const { | |
| 80 if (length_ != other.length_) { | |
| 81 return false; | |
| 82 } | |
| 83 for (int i = 0; i < length_; ++i) { | |
| 84 if (code_[i] != other.code_[i]) { | |
| 85 return false; | |
| 86 } | |
| 87 } | |
| 88 return true; | |
| 89 } | |
| 90 // Hash functor for RecodedCharID. | |
| 91 struct RecodedCharIDHash { | |
| 92 uint64_t operator()(const RecodedCharID &code) const { | |
| 93 uint64_t result = 0; | |
| 94 for (int i = 0; i < code.length_; ++i) { | |
| 95 result ^= static_cast<uint64_t>(code(i)) << (7 * i); | |
| 96 } | |
| 97 return result; | |
| 98 } | |
| 99 }; | |
| 100 | |
| 101 private: | |
| 102 // True if this code is self-normalizing, ie is the master entry for indices | |
| 103 // that map to the same code. Has boolean value, but int8_t for serialization. | |
| 104 int8_t self_normalized_; | |
| 105 // The number of elements in use in code_; | |
| 106 int32_t length_; | |
| 107 // The re-encoded form of the unichar-id to which this RecodedCharID relates. | |
| 108 int32_t code_[kMaxCodeLen]; | |
| 109 }; | |
| 110 | |
| 111 // Class holds a "compression" of a unicharset to simplify the learning problem | |
| 112 // for a neural-network-based classifier. | |
| 113 // Objectives: | |
| 114 // 1 (CJK): Ids of a unicharset with a large number of classes are expressed as | |
| 115 // a sequence of 3 codes with much fewer values. | |
| 116 // This is achieved using the Jamo coding for Hangul and the Unicode | |
| 117 // Radical-Stroke-index for Han. | |
| 118 // 2 (Indic): Instead of thousands of codes with one for each grapheme, re-code | |
| 119 // as the unicode sequence (but coded in a more compact space). | |
| 120 // 3 (the rest): Eliminate multi-path problems with ligatures and fold confusing | |
| 121 // and not significantly distinct shapes (quotes) together, ie | |
| 122 // represent the fi ligature as the f-i pair, and fold u+2019 and | |
| 123 // friends all onto ascii single ' | |
| 124 // 4 The null character and mapping to target activations: | |
| 125 // To save horizontal coding space, the compressed codes are generally mapped | |
| 126 // to target network activations without intervening null characters, BUT | |
| 127 // in the case of ligatures, such as ff, null characters have to be included | |
| 128 // so existence of repeated codes is detected at codebook-building time, and | |
| 129 // null characters are embedded directly into the codes, so the rest of the | |
| 130 // system doesn't need to worry about the problem (much). There is still an | |
| 131 // effect on the range of ways in which the target activations can be | |
| 132 // generated. | |
| 133 // | |
| 134 // The computed code values are compact (no unused values), and, for CJK, | |
| 135 // unique (each code position uses a disjoint set of values from each other code | |
| 136 // position). For non-CJK, the same code value CAN be used in multiple | |
| 137 // positions, eg the ff ligature is converted to <f> <nullchar> <f>, where <f> | |
| 138 // is the same code as is used for the single f. | |
| 139 class TESS_API UnicharCompress { | |
| 140 public: | |
| 141 UnicharCompress(); | |
| 142 UnicharCompress(const UnicharCompress &src); | |
| 143 ~UnicharCompress(); | |
| 144 UnicharCompress &operator=(const UnicharCompress &src); | |
| 145 | |
| 146 // The 1st Hangul unicode. | |
| 147 static const int kFirstHangul = 0xac00; | |
| 148 // The number of Hangul unicodes. | |
| 149 static const int kNumHangul = 11172; | |
| 150 // The number of Jamos for each of the 3 parts of a Hangul character, being | |
| 151 // the Leading consonant, Vowel and Trailing consonant. | |
| 152 static const int kLCount = 19; | |
| 153 static const int kVCount = 21; | |
| 154 static const int kTCount = 28; | |
| 155 | |
| 156 // Computes the encoding for the given unicharset. It is a requirement that | |
| 157 // the file training/langdata/radical-stroke.txt have been read into the | |
| 158 // input string radical_stroke_table. | |
| 159 // Returns false if the encoding cannot be constructed. | |
| 160 bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table); | |
| 161 // Sets up an encoder that doesn't change the unichars at all, so it just | |
| 162 // passes them through unchanged. | |
| 163 void SetupPassThrough(const UNICHARSET &unicharset); | |
| 164 // Sets up an encoder directly using the given encoding vector, which maps | |
| 165 // unichar_ids to the given codes. | |
| 166 void SetupDirect(const std::vector<RecodedCharID> &codes); | |
| 167 | |
| 168 // Returns the number of different values that can be used in a code, ie | |
| 169 // 1 + the maximum value that will ever be used by an RecodedCharID code in | |
| 170 // any position in its array. | |
| 171 int code_range() const { | |
| 172 return code_range_; | |
| 173 } | |
| 174 | |
| 175 // Encodes a single unichar_id. Returns the length of the code, (or zero if | |
| 176 // invalid input), and the encoding itself in code. | |
| 177 int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const; | |
| 178 // Decodes code, returning the original unichar-id, or | |
| 179 // INVALID_UNICHAR_ID if the input is invalid. | |
| 180 int DecodeUnichar(const RecodedCharID &code) const; | |
| 181 // Returns true if the given code is a valid start or single code. | |
| 182 bool IsValidFirstCode(int code) const { | |
| 183 return is_valid_start_[code]; | |
| 184 } | |
| 185 // Returns a list of valid non-final next codes for a given prefix code, | |
| 186 // which may be empty. | |
| 187 const std::vector<int> *GetNextCodes(const RecodedCharID &code) const { | |
| 188 auto it = next_codes_.find(code); | |
| 189 return it == next_codes_.end() ? nullptr : it->second; | |
| 190 } | |
| 191 // Returns a list of valid final codes for a given prefix code, which may | |
| 192 // be empty. | |
| 193 const std::vector<int> *GetFinalCodes(const RecodedCharID &code) const { | |
| 194 auto it = final_codes_.find(code); | |
| 195 return it == final_codes_.end() ? nullptr : it->second; | |
| 196 } | |
| 197 | |
| 198 // Writes to the given file. Returns false in case of error. | |
| 199 bool Serialize(TFile *fp) const; | |
| 200 // Reads from the given file. Returns false in case of error. | |
| 201 | |
| 202 bool DeSerialize(TFile *fp); | |
| 203 | |
| 204 // Returns a string containing a text file that describes the encoding thus: | |
| 205 // <index>[,<index>]*<tab><UTF8-str><newline> | |
| 206 // In words, a comma-separated list of one or more indices, followed by a tab | |
| 207 // and the UTF-8 string that the code represents per line. Most simple scripts | |
| 208 // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean | |
| 209 // and the Indic scripts will contain a many-to-many mapping. | |
| 210 // See the class comment above for details. | |
| 211 std::string GetEncodingAsString(const UNICHARSET &unicharset) const; | |
| 212 | |
| 213 // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing. | |
| 214 // Note that the returned values are 0-based indices, NOT unicode Jamo. | |
| 215 // Returns false if the input is not in the Hangul unicode range. | |
| 216 static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing); | |
| 217 | |
| 218 private: | |
| 219 // Renumbers codes to eliminate unused values. | |
| 220 void DefragmentCodeValues(int encoded_null); | |
| 221 // Computes the value of code_range_ from the encoder_. | |
| 222 void ComputeCodeRange(); | |
| 223 // Initializes the decoding hash_map from the encoder_ array. | |
| 224 void SetupDecoder(); | |
| 225 // Frees allocated memory. | |
| 226 void Cleanup(); | |
| 227 | |
| 228 // The encoder that maps a unichar-id to a sequence of small codes. | |
| 229 // encoder_ is the only part that is serialized. The rest is computed on load. | |
| 230 std::vector<RecodedCharID> encoder_; | |
| 231 // Decoder converts the output of encoder back to a unichar-id. | |
| 232 std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash> decoder_; | |
| 233 // True if the index is a valid single or start code. | |
| 234 std::vector<bool> is_valid_start_; | |
| 235 // Maps a prefix code to a list of valid next codes. | |
| 236 // The map owns the vectors. | |
| 237 std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash> | |
| 238 next_codes_; | |
| 239 // Maps a prefix code to a list of valid final codes. | |
| 240 // The map owns the vectors. | |
| 241 std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash> | |
| 242 final_codes_; | |
| 243 // Max of any value in encoder_ + 1. | |
| 244 int code_range_; | |
| 245 }; | |
| 246 | |
| 247 } // namespace tesseract. | |
| 248 | |
| 249 #endif // TESSERACT_CCUTIL_UNICHARCOMPRESS_H_ |
