Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/pango/ligature_table.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: ligature_table.cpp | |
| 3 * Description: Class for adding and removing optional latin ligatures, | |
| 4 * conditional on codepoint support by a specified font | |
| 5 * (if specified). | |
| 6 * Author: Ranjith Unnikrishnan | |
| 7 * | |
| 8 * (C) Copyright 2013, Google Inc. | |
| 9 * Licensed under the Apache License, Version 2.0 (the "License"); | |
| 10 * you may not use this file except in compliance with the License. | |
| 11 * You may obtain a copy of the License at | |
| 12 * http://www.apache.org/licenses/LICENSE-2.0 | |
| 13 * Unless required by applicable law or agreed to in writing, software | |
| 14 * distributed under the License is distributed on an "AS IS" BASIS, | |
| 15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 16 * See the License for the specific language governing permissions and | |
| 17 * limitations under the License. | |
| 18 * | |
| 19 **********************************************************************/ | |
| 20 | |
| 21 #include "ligature_table.h" | |
| 22 | |
| 23 #include <tesseract/unichar.h> | |
| 24 #include "pango_font_info.h" | |
| 25 #include "tlog.h" | |
| 26 #include "unicharset.h" | |
| 27 #include "unicode/errorcode.h" // from libicu | |
| 28 #include "unicode/normlzr.h" // from libicu | |
| 29 #include "unicode/unistr.h" // from libicu | |
| 30 #include "unicode/utypes.h" // from libicu | |
| 31 | |
| 32 #include <utility> | |
| 33 | |
| 34 namespace tesseract { | |
| 35 | |
| 36 static std::string EncodeAsUTF8(const char32 ch32) { | |
| 37 UNICHAR uni_ch(ch32); | |
| 38 return std::string(uni_ch.utf8(), uni_ch.utf8_len()); | |
| 39 } | |
| 40 | |
// Inclusive bounds of the Unicode range of optional latin ligature characters
// that ligatures are built from. The custom ligatures encoded in the private
// use area lie outside this range.
const int kMinLigature = 0xFB00;
const int kMaxLigature = 0xFB17; // Stops short of the wide Hebrew letters.
| 46 | |
| 47 /* static */ | |
| 48 std::unique_ptr<LigatureTable> LigatureTable::instance_; | |
| 49 | |
| 50 /* static */ | |
| 51 LigatureTable *LigatureTable::Get() { | |
| 52 if (instance_ == nullptr) { | |
| 53 instance_.reset(new LigatureTable()); | |
| 54 instance_->Init(); | |
| 55 } | |
| 56 return instance_.get(); | |
| 57 } | |
| 58 | |
| 59 LigatureTable::LigatureTable() | |
| 60 : min_lig_length_(0), max_lig_length_(0), min_norm_length_(0), max_norm_length_(0) {} | |
| 61 | |
| 62 void LigatureTable::Init() { | |
| 63 if (norm_to_lig_table_.empty()) { | |
| 64 for (char32 lig = kMinLigature; lig <= kMaxLigature; ++lig) { | |
| 65 // For each char in the range, convert to utf8, nfc normalize, and if | |
| 66 // the strings are different put the both mappings in the hash_maps. | |
| 67 std::string lig8 = EncodeAsUTF8(lig); | |
| 68 icu::UnicodeString unicode_lig8(static_cast<UChar32>(lig)); | |
| 69 icu::UnicodeString normed8_result; | |
| 70 icu::ErrorCode status; | |
| 71 icu::Normalizer::normalize(unicode_lig8, UNORM_NFC, 0, normed8_result, status); | |
| 72 std::string normed8; | |
| 73 normed8_result.toUTF8String(normed8); | |
| 74 int lig_length = lig8.length(); | |
| 75 int norm_length = normed8.size(); | |
| 76 if (normed8 != lig8 && lig_length > 1 && norm_length > 1) { | |
| 77 norm_to_lig_table_[normed8] = lig8; | |
| 78 lig_to_norm_table_[lig8] = std::move(normed8); | |
| 79 if (min_lig_length_ == 0 || lig_length < min_lig_length_) { | |
| 80 min_lig_length_ = lig_length; | |
| 81 } | |
| 82 if (lig_length > max_lig_length_) { | |
| 83 max_lig_length_ = lig_length; | |
| 84 } | |
| 85 if (min_norm_length_ == 0 || norm_length < min_norm_length_) { | |
| 86 min_norm_length_ = norm_length; | |
| 87 } | |
| 88 if (norm_length > max_norm_length_) { | |
| 89 max_norm_length_ = norm_length; | |
| 90 } | |
| 91 } | |
| 92 } | |
| 93 // Add custom extra ligatures. | |
| 94 for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) { | |
| 95 norm_to_lig_table_[UNICHARSET::kCustomLigatures[i][0]] = UNICHARSET::kCustomLigatures[i][1]; | |
| 96 int norm_length = strlen(UNICHARSET::kCustomLigatures[i][0]); | |
| 97 if (min_norm_length_ == 0 || norm_length < min_norm_length_) { | |
| 98 min_norm_length_ = norm_length; | |
| 99 } | |
| 100 if (norm_length > max_norm_length_) { | |
| 101 max_norm_length_ = norm_length; | |
| 102 } | |
| 103 | |
| 104 lig_to_norm_table_[UNICHARSET::kCustomLigatures[i][1]] = UNICHARSET::kCustomLigatures[i][0]; | |
| 105 } | |
| 106 } | |
| 107 } | |
| 108 | |
| 109 std::string LigatureTable::RemoveLigatures(const std::string &str) const { | |
| 110 std::string result; | |
| 111 UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length()); | |
| 112 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length()); | |
| 113 char tmp[5]; | |
| 114 int len; | |
| 115 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) { | |
| 116 len = it.get_utf8(tmp); | |
| 117 tmp[len] = '\0'; | |
| 118 auto lig_it = lig_to_norm_table_.find(tmp); | |
| 119 if (lig_it != lig_to_norm_table_.end()) { | |
| 120 result += lig_it->second; | |
| 121 } else { | |
| 122 result += tmp; | |
| 123 } | |
| 124 } | |
| 125 return result; | |
| 126 } | |
| 127 | |
| 128 std::string LigatureTable::RemoveCustomLigatures(const std::string &str) const { | |
| 129 std::string result; | |
| 130 UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length()); | |
| 131 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length()); | |
| 132 char tmp[5]; | |
| 133 int len; | |
| 134 int norm_ind; | |
| 135 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) { | |
| 136 len = it.get_utf8(tmp); | |
| 137 tmp[len] = '\0'; | |
| 138 norm_ind = -1; | |
| 139 for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr && norm_ind < 0; ++i) { | |
| 140 if (!strcmp(tmp, UNICHARSET::kCustomLigatures[i][1])) { | |
| 141 norm_ind = i; | |
| 142 } | |
| 143 } | |
| 144 if (norm_ind >= 0) { | |
| 145 result += UNICHARSET::kCustomLigatures[norm_ind][0]; | |
| 146 } else { | |
| 147 result += tmp; | |
| 148 } | |
| 149 } | |
| 150 return result; | |
| 151 } | |
| 152 | |
| 153 std::string LigatureTable::AddLigatures(const std::string &str, const PangoFontInfo *font) const { | |
| 154 std::string result; | |
| 155 int len = str.size(); | |
| 156 int step = 0; | |
| 157 int i = 0; | |
| 158 for (i = 0; i < len - min_norm_length_ + 1; i += step) { | |
| 159 step = 0; | |
| 160 for (int liglen = max_norm_length_; liglen >= min_norm_length_; --liglen) { | |
| 161 if (i + liglen <= len) { | |
| 162 std::string lig_cand = str.substr(i, liglen); | |
| 163 auto it = norm_to_lig_table_.find(lig_cand); | |
| 164 if (it != norm_to_lig_table_.end()) { | |
| 165 tlog(3, "Considering %s -> %s\n", lig_cand.c_str(), it->second.c_str()); | |
| 166 if (font) { | |
| 167 // Test for renderability. | |
| 168 if (!font->CanRenderString(it->second.data(), it->second.length())) { | |
| 169 continue; // Not renderable | |
| 170 } | |
| 171 } | |
| 172 // Found a match so convert it. | |
| 173 step = liglen; | |
| 174 result += it->second; | |
| 175 tlog(2, "Substituted %s -> %s\n", lig_cand.c_str(), it->second.c_str()); | |
| 176 break; | |
| 177 } | |
| 178 } | |
| 179 } | |
| 180 if (step == 0) { | |
| 181 result += str[i]; | |
| 182 step = 1; | |
| 183 } | |
| 184 } | |
| 185 result += str.substr(i, len - i); | |
| 186 return result; | |
| 187 } | |
| 188 | |
| 189 } // namespace tesseract |
