Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccutil/ambigs.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: ambigs.cpp | |
| 3 // Description: Functions for dealing with ambiguities | |
| 4 // (training and recognition). | |
| 5 // Author: Daria Antonova | |
| 6 // | |
| 7 // (C) Copyright 2008, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 // | |
| 18 /////////////////////////////////////////////////////////////////////// | |
| 19 | |
| 20 #include "ambigs.h" | |
| 21 | |
| 22 #include "helpers.h" | |
| 23 #include "universalambigs.h" | |
| 24 | |
| 25 #include <cstdio> | |
| 26 | |
// On Windows builds that do not use a GNU-compatible compiler, route calls to
// strtok_r through strtok_s, passing the same three arguments in the same order.
#if defined(_WIN32) && !defined(__GNUC__)
#  define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr)
#endif /* _WIN32 && !__GNUC__ */
| 30 | |
| 31 namespace tesseract { | |
| 32 | |
// Field separators (tab and space) used when tokenizing an ambiguity line
// with strtok_r.
static const char kAmbigDelimiters[] = "\t ";
// printf-style diagnostics; the first takes the line number, the second the
// offending unichar string.
static const char kIllegalMsg[] = "Illegal ambiguity specification on line %d\n";
static const char kIllegalUnicharMsg[] = "Illegal unichar %s in ambiguity specification\n";

// Maximum size of one part (wrong or replacement) of an ambiguity:
//   UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) bytes.
// A full line holds two such parts plus 10 bytes for the sizes of the ambigs,
// tabs, ambig type and newline; that total is computed where the line buffer
// is allocated.
const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
| 41 | |
| 42 AmbigSpec::AmbigSpec() : correct_ngram_id(INVALID_UNICHAR_ID), type(NOT_AMBIG), wrong_ngram_size(0) { | |
| 43 wrong_ngram[0] = INVALID_UNICHAR_ID; | |
| 44 correct_fragments[0] = INVALID_UNICHAR_ID; | |
| 45 } | |
| 46 | |
| 47 // Initializes the ambigs by adding a nullptr pointer to each table. | |
| 48 void UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption) { | |
| 49 for (unsigned i = 0; i < unicharset.size(); ++i) { | |
| 50 replace_ambigs_.push_back(nullptr); | |
| 51 dang_ambigs_.push_back(nullptr); | |
| 52 one_to_one_definite_ambigs_.push_back(nullptr); | |
| 53 if (use_ambigs_for_adaption) { | |
| 54 ambigs_for_adaption_.push_back(nullptr); | |
| 55 reverse_ambigs_for_adaption_.push_back(nullptr); | |
| 56 } | |
| 57 } | |
| 58 } | |
| 59 | |
| 60 // Loads the universal ambigs that are useful for any language. | |
| 61 void UnicharAmbigs::LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset) { | |
| 62 TFile file; | |
| 63 if (!file.Open(kUniversalAmbigsFile, ksizeofUniversalAmbigsFile)) { | |
| 64 return; | |
| 65 } | |
| 66 LoadUnicharAmbigs(encoder_set, &file, 0, false, unicharset); | |
| 67 } | |
| 68 | |
| 69 void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambig_file, | |
| 70 int debug_level, bool use_ambigs_for_adaption, | |
| 71 UNICHARSET *unicharset) { | |
| 72 UnicharIdVector *adaption_ambigs_entry; | |
| 73 if (debug_level) { | |
| 74 tprintf("Reading ambiguities\n"); | |
| 75 } | |
| 76 | |
| 77 int test_ambig_part_size; | |
| 78 int replacement_ambig_part_size; | |
| 79 // The space for buffer is allocated on the heap to avoid | |
| 80 // GCC frame size warning. | |
| 81 const int kBufferSize = 10 + 2 * kMaxAmbigStringSize; | |
| 82 char *buffer = new char[kBufferSize]; | |
| 83 char replacement_string[kMaxAmbigStringSize]; | |
| 84 UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1]; | |
| 85 int line_num = 0; | |
| 86 int type = NOT_AMBIG; | |
| 87 | |
| 88 // Determine the version of the ambigs file. | |
| 89 int version = 0; | |
| 90 ASSERT_HOST(ambig_file->FGets(buffer, kBufferSize) != nullptr && buffer[0] != '\0'); | |
| 91 if (*buffer == 'v') { | |
| 92 version = static_cast<int>(strtol(buffer + 1, nullptr, 10)); | |
| 93 ++line_num; | |
| 94 } else { | |
| 95 ambig_file->Rewind(); | |
| 96 } | |
| 97 while (ambig_file->FGets(buffer, kBufferSize) != nullptr) { | |
| 98 chomp_string(buffer); | |
| 99 if (debug_level > 2) { | |
| 100 tprintf("read line %s\n", buffer); | |
| 101 } | |
| 102 ++line_num; | |
| 103 if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set, buffer, | |
| 104 &test_ambig_part_size, test_unichar_ids, &replacement_ambig_part_size, | |
| 105 replacement_string, &type)) { | |
| 106 continue; | |
| 107 } | |
| 108 // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST. | |
| 109 auto *ambig_spec = new AmbigSpec(); | |
| 110 if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_, | |
| 111 test_ambig_part_size, test_unichar_ids, replacement_ambig_part_size, | |
| 112 replacement_string, type, ambig_spec, unicharset)) { | |
| 113 continue; | |
| 114 } | |
| 115 | |
| 116 // Update one_to_one_definite_ambigs_. | |
| 117 if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) { | |
| 118 if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == nullptr) { | |
| 119 one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector(); | |
| 120 } | |
| 121 one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(ambig_spec->correct_ngram_id); | |
| 122 } | |
| 123 // Update ambigs_for_adaption_. | |
| 124 if (use_ambigs_for_adaption) { | |
| 125 std::vector<UNICHAR_ID> encoding; | |
| 126 // Silently ignore invalid strings, as before, so it is safe to use a | |
| 127 // universal ambigs file. | |
| 128 if (unicharset->encode_string(replacement_string, true, &encoding, nullptr, nullptr)) { | |
| 129 for (int i = 0; i < test_ambig_part_size; ++i) { | |
| 130 if (ambigs_for_adaption_[test_unichar_ids[i]] == nullptr) { | |
| 131 ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector(); | |
| 132 } | |
| 133 adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]]; | |
| 134 for (int id_to_insert : encoding) { | |
| 135 ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID); | |
| 136 // Add the new unichar id to adaption_ambigs_entry (only if the | |
| 137 // vector does not already contain it) keeping it in sorted order. | |
| 138 size_t j; | |
| 139 for (j = 0; | |
| 140 j < adaption_ambigs_entry->size() && (*adaption_ambigs_entry)[j] > id_to_insert; | |
| 141 ++j) { | |
| 142 } | |
| 143 if (j < adaption_ambigs_entry->size()) { | |
| 144 if ((*adaption_ambigs_entry)[j] != id_to_insert) { | |
| 145 adaption_ambigs_entry->insert(adaption_ambigs_entry->begin() + j, id_to_insert); | |
| 146 } | |
| 147 } else { | |
| 148 adaption_ambigs_entry->push_back(id_to_insert); | |
| 149 } | |
| 150 } | |
| 151 } | |
| 152 } | |
| 153 } | |
| 154 } | |
| 155 delete[] buffer; | |
| 156 | |
| 157 // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector. | |
| 158 if (use_ambigs_for_adaption) { | |
| 159 for (size_t i = 0; i < ambigs_for_adaption_.size(); ++i) { | |
| 160 adaption_ambigs_entry = ambigs_for_adaption_[i]; | |
| 161 if (adaption_ambigs_entry == nullptr) { | |
| 162 continue; | |
| 163 } | |
| 164 for (size_t j = 0; j < adaption_ambigs_entry->size(); ++j) { | |
| 165 UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j]; | |
| 166 if (reverse_ambigs_for_adaption_[ambig_id] == nullptr) { | |
| 167 reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector(); | |
| 168 } | |
| 169 reverse_ambigs_for_adaption_[ambig_id]->push_back(i); | |
| 170 } | |
| 171 } | |
| 172 } | |
| 173 | |
| 174 // Print what was read from the input file. | |
| 175 if (debug_level > 1) { | |
| 176 for (int tbl = 0; tbl < 2; ++tbl) { | |
| 177 const UnicharAmbigsVector &print_table = (tbl == 0) ? replace_ambigs_ : dang_ambigs_; | |
| 178 for (size_t i = 0; i < print_table.size(); ++i) { | |
| 179 AmbigSpec_LIST *lst = print_table[i]; | |
| 180 if (lst == nullptr) { | |
| 181 continue; | |
| 182 } | |
| 183 if (!lst->empty()) { | |
| 184 tprintf("%s Ambiguities for %s:\n", (tbl == 0) ? "Replaceable" : "Dangerous", | |
| 185 unicharset->debug_str(i).c_str()); | |
| 186 } | |
| 187 AmbigSpec_IT lst_it(lst); | |
| 188 for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) { | |
| 189 AmbigSpec *ambig_spec = lst_it.data(); | |
| 190 tprintf("wrong_ngram:"); | |
| 191 UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset); | |
| 192 tprintf("correct_fragments:"); | |
| 193 UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset); | |
| 194 } | |
| 195 } | |
| 196 } | |
| 197 if (use_ambigs_for_adaption) { | |
| 198 for (int vec_id = 0; vec_id < 2; ++vec_id) { | |
| 199 const std::vector<UnicharIdVector *> &vec = | |
| 200 (vec_id == 0) ? ambigs_for_adaption_ : reverse_ambigs_for_adaption_; | |
| 201 for (size_t i = 0; i < vec.size(); ++i) { | |
| 202 adaption_ambigs_entry = vec[i]; | |
| 203 if (adaption_ambigs_entry != nullptr) { | |
| 204 tprintf("%sAmbigs for adaption for %s:\n", (vec_id == 0) ? "" : "Reverse ", | |
| 205 unicharset->debug_str(i).c_str()); | |
| 206 for (size_t j = 0; j < adaption_ambigs_entry->size(); ++j) { | |
| 207 tprintf("%s ", unicharset->debug_str((*adaption_ambigs_entry)[j]).c_str()); | |
| 208 } | |
| 209 tprintf("\n"); | |
| 210 } | |
| 211 } | |
| 212 } | |
| 213 } | |
| 214 } | |
| 215 } | |
| 216 | |
| 217 bool UnicharAmbigs::ParseAmbiguityLine(int line_num, int version, int debug_level, | |
| 218 const UNICHARSET &unicharset, char *buffer, | |
| 219 int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids, | |
| 220 int *replacement_ambig_part_size, char *replacement_string, | |
| 221 int *type) { | |
| 222 if (version > 1) { | |
| 223 // Simpler format is just wrong-string correct-string type\n. | |
| 224 std::string input(buffer); | |
| 225 std::vector<std::string> fields = split(input, ' '); | |
| 226 if (fields.size() != 3) { | |
| 227 if (debug_level) { | |
| 228 tprintf(kIllegalMsg, line_num); | |
| 229 } | |
| 230 return false; | |
| 231 } | |
| 232 // Encode wrong-string. | |
| 233 std::vector<UNICHAR_ID> unichars; | |
| 234 if (!unicharset.encode_string(fields[0].c_str(), true, &unichars, nullptr, nullptr)) { | |
| 235 return false; | |
| 236 } | |
| 237 *test_ambig_part_size = unichars.size(); | |
| 238 if (*test_ambig_part_size > MAX_AMBIG_SIZE) { | |
| 239 if (debug_level) { | |
| 240 tprintf("Too many unichars in ambiguity on line %d\n", line_num); | |
| 241 } | |
| 242 return false; | |
| 243 } | |
| 244 // Copy encoded string to output. | |
| 245 for (size_t i = 0; i < unichars.size(); ++i) { | |
| 246 test_unichar_ids[i] = unichars[i]; | |
| 247 } | |
| 248 test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID; | |
| 249 // Encode replacement-string to check validity. | |
| 250 if (!unicharset.encode_string(fields[1].c_str(), true, &unichars, nullptr, nullptr)) { | |
| 251 return false; | |
| 252 } | |
| 253 *replacement_ambig_part_size = unichars.size(); | |
| 254 if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { | |
| 255 if (debug_level) { | |
| 256 tprintf("Too many unichars in ambiguity on line %d\n", line_num); | |
| 257 } | |
| 258 return false; | |
| 259 } | |
| 260 if (sscanf(fields[2].c_str(), "%d", type) != 1) { | |
| 261 if (debug_level) { | |
| 262 tprintf(kIllegalMsg, line_num); | |
| 263 } | |
| 264 return false; | |
| 265 } | |
| 266 snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].c_str()); | |
| 267 return true; | |
| 268 } | |
| 269 int i; | |
| 270 char *next_token; | |
| 271 char *token = strtok_r(buffer, kAmbigDelimiters, &next_token); | |
| 272 if (!token || sscanf(token, "%d", test_ambig_part_size) != 1 || | |
| 273 *test_ambig_part_size <= 0) { | |
| 274 if (debug_level) { | |
| 275 tprintf(kIllegalMsg, line_num); | |
| 276 } | |
| 277 return false; | |
| 278 } | |
| 279 if (*test_ambig_part_size > MAX_AMBIG_SIZE) { | |
| 280 if (debug_level) { | |
| 281 tprintf("Too many unichars in ambiguity on line %d\n", line_num); | |
| 282 } | |
| 283 return false; | |
| 284 } | |
| 285 for (i = 0; i < *test_ambig_part_size; ++i) { | |
| 286 if (!(token = strtok_r(nullptr, kAmbigDelimiters, &next_token))) { | |
| 287 break; | |
| 288 } | |
| 289 if (!unicharset.contains_unichar(token)) { | |
| 290 if (debug_level) { | |
| 291 tprintf(kIllegalUnicharMsg, token); | |
| 292 } | |
| 293 break; | |
| 294 } | |
| 295 test_unichar_ids[i] = unicharset.unichar_to_id(token); | |
| 296 } | |
| 297 test_unichar_ids[i] = INVALID_UNICHAR_ID; | |
| 298 | |
| 299 if (i != *test_ambig_part_size || !(token = strtok_r(nullptr, kAmbigDelimiters, &next_token)) || | |
| 300 sscanf(token, "%d", replacement_ambig_part_size) != 1 || | |
| 301 *replacement_ambig_part_size <= 0) { | |
| 302 if (debug_level) { | |
| 303 tprintf(kIllegalMsg, line_num); | |
| 304 } | |
| 305 return false; | |
| 306 } | |
| 307 if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { | |
| 308 if (debug_level) { | |
| 309 tprintf("Too many unichars in ambiguity on line %d\n", line_num); | |
| 310 } | |
| 311 return false; | |
| 312 } | |
| 313 replacement_string[0] = '\0'; | |
| 314 for (i = 0; i < *replacement_ambig_part_size; ++i) { | |
| 315 if (!(token = strtok_r(nullptr, kAmbigDelimiters, &next_token))) { | |
| 316 break; | |
| 317 } | |
| 318 strcat(replacement_string, token); | |
| 319 if (!unicharset.contains_unichar(token)) { | |
| 320 if (debug_level) { | |
| 321 tprintf(kIllegalUnicharMsg, token); | |
| 322 } | |
| 323 break; | |
| 324 } | |
| 325 } | |
| 326 if (i != *replacement_ambig_part_size) { | |
| 327 if (debug_level) { | |
| 328 tprintf(kIllegalMsg, line_num); | |
| 329 } | |
| 330 return false; | |
| 331 } | |
| 332 if (version > 0) { | |
| 333 // The next field being true indicates that the ambiguity should | |
| 334 // always be substituted (e.g. '' should always be changed to "). | |
| 335 // For such "certain" n -> m ambigs tesseract will insert character | |
| 336 // fragments for the n pieces in the unicharset. AmbigsFound() | |
| 337 // will then replace the incorrect ngram with the character | |
| 338 // fragments of the correct character (or ngram if m > 1). | |
| 339 // Note that if m > 1, an ngram will be inserted into the | |
| 340 // modified word, not the individual unigrams. Tesseract | |
| 341 // has limited support for ngram unichar (e.g. dawg permuter). | |
| 342 token = strtok_r(nullptr, kAmbigDelimiters, &next_token); | |
| 343 if (!token || sscanf(token, "%d", type) != 1) { | |
| 344 if (debug_level) { | |
| 345 tprintf(kIllegalMsg, line_num); | |
| 346 } | |
| 347 return false; | |
| 348 } | |
| 349 } | |
| 350 return true; | |
| 351 } | |
| 352 | |
| 353 bool UnicharAmbigs::InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size, | |
| 354 UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size, | |
| 355 const char *replacement_string, int type, AmbigSpec *ambig_spec, | |
| 356 UNICHARSET *unicharset) { | |
| 357 ambig_spec->type = static_cast<AmbigType>(type); | |
| 358 if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 && | |
| 359 unicharset->to_lower(test_unichar_ids[0]) == | |
| 360 unicharset->to_lower(unicharset->unichar_to_id(replacement_string))) { | |
| 361 ambig_spec->type = CASE_AMBIG; | |
| 362 } | |
| 363 | |
| 364 ambig_spec->wrong_ngram_size = | |
| 365 UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram); | |
| 366 | |
| 367 // Since we need to maintain a constant number of unichar positions in | |
| 368 // order to construct ambig_blob_choices vector in NoDangerousAmbig(), for | |
| 369 // each n->m ambiguity we will have to place n character fragments of the | |
| 370 // correct ngram into the corresponding positions in the vector (e.g. given | |
| 371 // "vvvvw" and vvvv->ww we will place v and |ww|0|4 into position 0, v and | |
| 372 // |ww|1|4 into position 1 and so on. The correct ngram is reconstructed | |
| 373 // from fragments by dawg_permute_and_select(). | |
| 374 | |
| 375 // Insert the corresponding correct ngram into the unicharset. | |
| 376 // Unicharset code assumes that the "base" ngram is inserted into | |
| 377 // the unicharset before fragments of this ngram are inserted. | |
| 378 unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue); | |
| 379 ambig_spec->correct_ngram_id = unicharset->unichar_to_id(replacement_string); | |
| 380 if (replacement_ambig_part_size > 1) { | |
| 381 unicharset->set_isngram(ambig_spec->correct_ngram_id, true); | |
| 382 } | |
| 383 // Add the corresponding fragments of the wrong ngram to unicharset. | |
| 384 int i; | |
| 385 for (i = 0; i < test_ambig_part_size; ++i) { | |
| 386 UNICHAR_ID unichar_id; | |
| 387 if (test_ambig_part_size == 1) { | |
| 388 unichar_id = ambig_spec->correct_ngram_id; | |
| 389 } else { | |
| 390 std::string frag_str = | |
| 391 CHAR_FRAGMENT::to_string(replacement_string, i, test_ambig_part_size, false); | |
| 392 unicharset->unichar_insert(frag_str.c_str(), OldUncleanUnichars::kTrue); | |
| 393 unichar_id = unicharset->unichar_to_id(frag_str.c_str()); | |
| 394 } | |
| 395 ambig_spec->correct_fragments[i] = unichar_id; | |
| 396 } | |
| 397 ambig_spec->correct_fragments[i] = INVALID_UNICHAR_ID; | |
| 398 | |
| 399 // Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST. | |
| 400 // Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram. | |
| 401 if (table[test_unichar_ids[0]] == nullptr) { | |
| 402 table[test_unichar_ids[0]] = new AmbigSpec_LIST(); | |
| 403 } | |
| 404 if (table[test_unichar_ids[0]]->add_sorted(AmbigSpec::compare_ambig_specs, true, ambig_spec)) { | |
| 405 return true; | |
| 406 } | |
| 407 delete ambig_spec; | |
| 408 return false; | |
| 409 } | |
| 410 | |
| 411 } // namespace tesseract |
