Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/dict/dict.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: dict.cpp | |
| 3 // Description: dict class. | |
| 4 // Author: Samuel Charron | |
| 5 // | |
| 6 // (C) Copyright 2006, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #include "dict.h" | |
| 20 | |
| 21 #include "tesserrstream.h" // for tesserr | |
| 22 #include "tprintf.h" | |
| 23 | |
| 24 #include <cstdio> | |
| 25 | |
| 26 namespace tesseract { | |
| 27 | |
| 28 class Image; | |
| 29 | |
// Constructs a Dict bound to the given CCUtil. The initializer list
// registers every dictionary-related Tesseract parameter (user word/pattern
// files, which dawgs to load, segmentation and stopper penalties) with
// ccutil's parameter list via the *_MEMBER/*_INIT_MEMBER macros; the body
// then resets all non-parameter runtime state to "nothing loaded".
// Load()/LoadLSTM() + FinishLoad() populate the dawgs later.
Dict::Dict(CCUtil *ccutil)
    : letter_is_okay_(&tesseract::Dict::def_letter_is_okay)
    , probability_in_context_(&tesseract::Dict::def_probability_in_context)
    , ccutil_(ccutil)
    // Special unichar ids stay INVALID until SetupForLoad() resolves them
    // against the loaded unicharset.
    , wildcard_unichar_id_(INVALID_UNICHAR_ID)
    , apostrophe_unichar_id_(INVALID_UNICHAR_ID)
    , question_unichar_id_(INVALID_UNICHAR_ID)
    , slash_unichar_id_(INVALID_UNICHAR_ID)
    , hyphen_unichar_id_(INVALID_UNICHAR_ID)
    , STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
                    getCCUtil()->params())
    , STRING_INIT_MEMBER(user_words_suffix, "",
                         "A suffix of user-provided words located in tessdata.",
                         getCCUtil()->params())
    , STRING_MEMBER(user_patterns_file, "", "A filename of user-provided patterns.",
                    getCCUtil()->params())
    , STRING_INIT_MEMBER(user_patterns_suffix, "",
                         "A suffix of user-provided patterns located in "
                         "tessdata.",
                         getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
                       getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_punc_dawg, true,
                       "Load dawg with punctuation"
                       " patterns.",
                       getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_number_dawg, true,
                       "Load dawg with number"
                       " patterns.",
                       getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_bigram_dawg, true,
                       "Load dawg with special word "
                       "bigrams.",
                       getCCUtil()->params())
    // Score penalties/multipliers below tune word-choice rating in
    // adjust_word(); lower multipliers favor the corresponding word class.
    , double_MEMBER(xheight_penalty_subscripts, 0.125,
                    "Score penalty (0.1 = 10%) added if there are subscripts "
                    "or superscripts in a word, but it is otherwise OK.",
                    getCCUtil()->params())
    , double_MEMBER(xheight_penalty_inconsistent, 0.25,
                    "Score penalty (0.1 = 10%) added if an xheight is "
                    "inconsistent.",
                    getCCUtil()->params())
    , double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
                    "Score multiplier for word matches which have good case and"
                    " are frequent in the given language (lower is better).",
                    getCCUtil()->params())
    , double_MEMBER(segment_penalty_dict_case_ok, 1.1,
                    "Score multiplier for word matches that have good case "
                    "(lower is better).",
                    getCCUtil()->params())
    , double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
                    "Default score multiplier for word matches, which may have "
                    "case issues (lower is better).",
                    getCCUtil()->params())
    , double_MEMBER(segment_penalty_dict_nonword, 1.25,
                    "Score multiplier for glyph fragment segmentations which "
                    "do not match a dictionary word (lower is better).",
                    getCCUtil()->params())
    , double_MEMBER(segment_penalty_garbage, 1.50,
                    "Score multiplier for poorly cased strings that are not in"
                    " the dictionary and generally look like garbage (lower is"
                    " better).",
                    getCCUtil()->params())
    , STRING_MEMBER(output_ambig_words_file, "",
                    "Output file for ambiguities found in the dictionary", getCCUtil()->params())
    , INT_MEMBER(dawg_debug_level, 0,
                 "Set to 1 for general debug info"
                 ", to 2 for more details, to 3 to see all the debug messages",
                 getCCUtil()->params())
    , INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", getCCUtil()->params())
    // NOTE: "uft8" is a long-standing upstream typo in the parameter name;
    // it cannot be fixed without breaking existing config files.
    , BOOL_MEMBER(use_only_first_uft8_step, false,
                  "Use only the first UTF8 step of the given string"
                  " when computing log probabilities.",
                  getCCUtil()->params())
    , double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", getCCUtil()->params())
    , double_MEMBER(stopper_nondict_certainty_base, -2.50, "Certainty threshold for non-dict words",
                    getCCUtil()->params())
    , double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, "Reject certainty offset",
                    getCCUtil()->params())
    , INT_MEMBER(stopper_smallword_size, 2, "Size of dict word to be treated as non-dict word",
                 getCCUtil()->params())
    , double_MEMBER(stopper_certainty_per_char, -0.50,
                    "Certainty to add"
                    " for each dict char above small word size.",
                    getCCUtil()->params())
    , double_MEMBER(stopper_allowable_character_badness, 3.0,
                    "Max certainty variation allowed in a word (in sigma)", getCCUtil()->params())
    , INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", getCCUtil()->params())
    , BOOL_MEMBER(stopper_no_acceptable_choices, false,
                  "Make AcceptableChoice() always return false. Useful"
                  " when there is a need to explore all segmentations",
                  getCCUtil()->params())
    , INT_MEMBER(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list",
                 getCCUtil()->params())
    , STRING_MEMBER(word_to_debug, "",
                    "Word for which stopper debug"
                    " information should be printed to stdout",
                    getCCUtil()->params())
    , BOOL_MEMBER(segment_nonalphabetic_script, false,
                  "Don't use any alphabetic-specific tricks."
                  " Set to true in the traineddata config file for"
                  " scripts that are cursive or inherently fixed-pitch",
                  getCCUtil()->params())
    , BOOL_MEMBER(save_doc_words, 0, "Save Document Words", getCCUtil()->params())
    , double_MEMBER(doc_dict_pending_threshold, 0.0, "Worst certainty for using pending dictionary",
                    getCCUtil()->params())
    , double_MEMBER(doc_dict_certainty_threshold, -2.25,
                    "Worst certainty for words that can be inserted into the"
                    " document dictionary",
                    getCCUtil()->params())
    , INT_MEMBER(max_permuter_attempts, 10000,
                 "Maximum number of different"
                 " character choices to consider during permutation."
                 " This limit is especially useful when user patterns"
                 " are specified, since overly generic patterns can result in"
                 " dawg search exploring an overly large number of options.",
                 getCCUtil()->params()) {
  // Non-parameter runtime state: no dawgs loaded, no hyphenated word in
  // progress, no output file open yet.
  reject_offset_ = 0.0;
  go_deeper_fxn_ = nullptr;
  hyphen_word_ = nullptr;
  last_word_on_line_ = false;
  document_words_ = nullptr;
  dawg_cache_ = nullptr;
  dawg_cache_is_ours_ = false;
  pending_words_ = nullptr;
  bigram_dawg_ = nullptr;
  freq_dawg_ = nullptr;
  punc_dawg_ = nullptr;
  unambig_dawg_ = nullptr;
  wordseg_rating_adjust_factor_ = -1.0f;
  output_ambig_words_file_ = nullptr;
}
| 164 | |
| 165 Dict::~Dict() { | |
| 166 End(); | |
| 167 delete hyphen_word_; | |
| 168 if (output_ambig_words_file_ != nullptr) { | |
| 169 fclose(output_ambig_words_file_); | |
| 170 } | |
| 171 } | |
| 172 | |
| 173 DawgCache *Dict::GlobalDawgCache() { | |
| 174 // This global cache (a singleton) will outlive every Tesseract instance | |
| 175 // (even those that someone else might declare as global static variables). | |
| 176 static DawgCache cache; | |
| 177 return &cache; | |
| 178 } | |
| 179 | |
| 180 // Sets up ready for a Load or LoadLSTM. | |
| 181 void Dict::SetupForLoad(DawgCache *dawg_cache) { | |
| 182 if (dawgs_.size() != 0) { | |
| 183 this->End(); | |
| 184 } | |
| 185 | |
| 186 apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol); | |
| 187 question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol); | |
| 188 slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol); | |
| 189 hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol); | |
| 190 | |
| 191 if (dawg_cache != nullptr) { | |
| 192 dawg_cache_ = dawg_cache; | |
| 193 dawg_cache_is_ours_ = false; | |
| 194 } else { | |
| 195 dawg_cache_ = new DawgCache(); | |
| 196 dawg_cache_is_ours_ = true; | |
| 197 } | |
| 198 } | |
| 199 | |
| 200 // Loads the dawgs needed by Tesseract. Call FinishLoad() after. | |
// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
// Loads the dawgs enabled by the load_*_dawg parameters from data_file via
// the dawg cache, then any user-supplied word/pattern tries, and finally
// creates the (initially empty) document-words and pending-words tries.
// Every dawg pushed onto dawgs_ participates in def_letter_is_okay();
// bigram_dawg_ and pending_words_ are deliberately kept out of dawgs_.
void Dict::Load(const std::string &lang, TessdataManager *data_file) {
  // Load dawgs_.
  if (load_punc_dawg) {
    punc_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
    if (punc_dawg_) {
      dawgs_.push_back(punc_dawg_);
    }
  }
  if (load_system_dawg) {
    Dawg *system_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
    if (system_dawg) {
      dawgs_.push_back(system_dawg);
    }
  }
  if (load_number_dawg) {
    Dawg *number_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
    if (number_dawg) {
      dawgs_.push_back(number_dawg);
    }
  }
  if (load_bigram_dawg) {
    bigram_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, dawg_debug_level, data_file);
    // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
    // dawgs_!!
  }
  if (load_freq_dawg) {
    freq_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
    if (freq_dawg_) {
      dawgs_.push_back(freq_dawg_);
    }
  }
  if (load_unambig_dawg) {
    unambig_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
    if (unambig_dawg_) {
      dawgs_.push_back(unambig_dawg_);
    }
  }

  // User words: an explicit file path takes precedence over a suffix
  // resolved relative to the language data path.
  std::string name;
  if (!user_words_suffix.empty() || !user_words_file.empty()) {
    Trie *trie_ptr =
        new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
    if (!user_words_file.empty()) {
      name = user_words_file;
    } else {
      name = getCCUtil()->language_data_path_prefix;
      name += user_words_suffix;
    }
    if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
                                          Trie::RRP_REVERSE_IF_HAS_RTL)) {
      // Load failure is non-fatal: report and continue without the trie.
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
      dawgs_.push_back(trie_ptr);
    }
  }

  // User patterns: same precedence rule as user words.
  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
    Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
                              dawg_debug_level);
    trie_ptr->initialize_patterns(&(getUnicharset()));
    if (!user_patterns_file.empty()) {
      name = user_patterns_file;
    } else {
      name = getCCUtil()->language_data_path_prefix;
      name += user_patterns_suffix;
    }
    if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
      dawgs_.push_back(trie_ptr);
    }
  }

  // Words discovered in the current document; searchable like other dawgs.
  document_words_ =
      new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
  dawgs_.push_back(document_words_);

  // This dawg is temporary and should not be searched by letter_is_ok.
  pending_words_ =
      new Trie(DAWG_TYPE_WORD, lang, NO_PERM, getUnicharset().size(), dawg_debug_level);
}
| 290 | |
| 291 // Loads the dawgs needed by the LSTM model. Call FinishLoad() after. | |
// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
// Same shape as Dict::Load(), but reads the LSTM-specific dawg entries
// from data_file and loads neither the bigram/freq/unambig dawgs nor the
// document/pending word tries.
void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
  // Load dawgs_.
  if (load_punc_dawg) {
    punc_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
    if (punc_dawg_) {
      dawgs_.push_back(punc_dawg_);
    }
  }
  if (load_system_dawg) {
    Dawg *system_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
    if (system_dawg) {
      dawgs_.push_back(system_dawg);
    }
  }
  if (load_number_dawg) {
    Dawg *number_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
    if (number_dawg) {
      dawgs_.push_back(number_dawg);
    }
  }

  // User word/pattern loading duplicated from Dict::Load (it needs params_
  // from Tesseract langdata/config/api). Explicit file path takes
  // precedence over a suffix resolved against the language data path.
  std::string name;
  if (!user_words_suffix.empty() || !user_words_file.empty()) {
    Trie *trie_ptr =
        new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
    if (!user_words_file.empty()) {
      name = user_words_file;
    } else {
      name = getCCUtil()->language_data_path_prefix;
      name += user_words_suffix;
    }
    if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
                                          Trie::RRP_REVERSE_IF_HAS_RTL)) {
      // Load failure is non-fatal: report and continue without the trie.
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
      dawgs_.push_back(trie_ptr);
    }
  }

  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
    Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
                              dawg_debug_level);
    trie_ptr->initialize_patterns(&(getUnicharset()));
    if (!user_patterns_file.empty()) {
      name = user_patterns_file;
    } else {
      name = getCCUtil()->language_data_path_prefix;
      name += user_patterns_suffix;
    }
    if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
      dawgs_.push_back(trie_ptr);
    }
  }
}
| 355 | |
| 356 // Completes the loading process after Load() and/or LoadLSTM(). | |
| 357 // Returns false if no dictionaries were loaded. | |
| 358 bool Dict::FinishLoad() { | |
| 359 if (dawgs_.empty()) { | |
| 360 return false; | |
| 361 } | |
| 362 // Construct a list of corresponding successors for each dawg. Each entry, i, | |
| 363 // in the successors_ vector is a vector of integers that represent the | |
| 364 // indices into the dawgs_ vector of the successors for dawg i. | |
| 365 successors_.reserve(dawgs_.size()); | |
| 366 for (auto dawg : dawgs_) { | |
| 367 auto *lst = new SuccessorList(); | |
| 368 for (unsigned j = 0; j < dawgs_.size(); ++j) { | |
| 369 const Dawg *other = dawgs_[j]; | |
| 370 if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) && | |
| 371 kDawgSuccessors[dawg->type()][other->type()]) { | |
| 372 lst->push_back(j); | |
| 373 } | |
| 374 } | |
| 375 successors_.push_back(lst); | |
| 376 } | |
| 377 return true; | |
| 378 } | |
| 379 | |
| 380 void Dict::End() { | |
| 381 if (dawgs_.empty()) { | |
| 382 return; // Not safe to call twice. | |
| 383 } | |
| 384 for (auto &dawg : dawgs_) { | |
| 385 if (!dawg_cache_->FreeDawg(dawg)) { | |
| 386 delete dawg; | |
| 387 } | |
| 388 } | |
| 389 dawg_cache_->FreeDawg(bigram_dawg_); | |
| 390 if (dawg_cache_is_ours_) { | |
| 391 delete dawg_cache_; | |
| 392 dawg_cache_ = nullptr; | |
| 393 } | |
| 394 for (auto successor : successors_) { | |
| 395 delete successor; | |
| 396 } | |
| 397 dawgs_.clear(); | |
| 398 successors_.clear(); | |
| 399 document_words_ = nullptr; | |
| 400 delete pending_words_; | |
| 401 pending_words_ = nullptr; | |
| 402 } | |
| 403 | |
| 404 // Returns true if in light of the current state unichar_id is allowed | |
| 405 // according to at least one of the dawgs in the dawgs_ vector. | |
| 406 // See more extensive comments in dict.h where this function is declared. | |
// Returns true if in light of the current state unichar_id is allowed
// according to at least one of the dawgs in the dawgs_ vector.
// See more extensive comments in dict.h where this function is declared.
//
// For each active DawgPosition this advances the (core dawg, punctuation
// dawg) pair by unichar_id, appending every still-viable position to
// dawg_args->updated_dawgs, setting dawg_args->valid_end when a complete
// word can end here, and returning the best permuter code found.
int Dict::def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset,
                             UNICHAR_ID unichar_id, bool word_end) const {
  auto *dawg_args = static_cast<DawgArgs *>(void_dawg_args);

  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));

  if (dawg_debug_level >= 3) {
    tesserr << "def_letter_is_okay: current unichar="
            << getUnicharset().debug_str(unichar_id)
            << " word_end=" << word_end
            << " num active dawgs=" << dawg_args->active_dawgs->size() << '\n';
  }

  // Do not accept words that contain kPatternUnicharID.
  // (otherwise pattern dawgs would not function correctly).
  // Do not accept words containing INVALID_UNICHAR_IDs.
  if (unichar_id == Dawg::kPatternUnicharID || unichar_id == INVALID_UNICHAR_ID) {
    dawg_args->permuter = NO_PERM;
    return NO_PERM;
  }

  // Initialization.
  PermuterType curr_perm = NO_PERM;
  dawg_args->updated_dawgs->clear();
  dawg_args->valid_end = false;

  // Go over the active_dawgs vector and insert DawgPosition records
  // with the updated ref (an edge with the corresponding unichar id) into
  // dawg_args->updated_pos.
  for (unsigned a = 0; a < dawg_args->active_dawgs->size(); ++a) {
    const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
    // Negative indices mean "no dawg of that kind at this position".
    const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
    const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;

    if (!dawg && !punc_dawg) {
      // shouldn't happen.
      tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
      continue;
    }
    if (!dawg) {
      // We're in the punctuation dawg. A core dawg has not been chosen.
      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
      // kPatternUnicharID edges in the punc dawg mark where a core word
      // may be embedded between leading/trailing punctuation.
      EDGE_REF punc_transition_edge =
          punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
      if (punc_transition_edge != NO_EDGE) {
        // Find all successors, and see which can transition.
        const SuccessorList &slist = *(successors_[pos.punc_index]);
        for (int sdawg_index : slist) {
          const Dawg *sdawg = dawgs_[sdawg_index];
          UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
          EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
          if (dawg_edge != NO_EDGE) {
            if (dawg_debug_level >= 3) {
              tprintf("Letter found in dawg %d\n", sdawg_index);
            }
            dawg_args->updated_dawgs->add_unique(
                DawgPosition(sdawg_index, dawg_edge, pos.punc_index, punc_transition_edge, false),
                dawg_debug_level > 0, "Append transition from punc dawg to current dawgs: ");
            if (sdawg->permuter() > curr_perm) {
              curr_perm = sdawg->permuter();
            }
            // Word may end here only if both the core dawg and the punc
            // pattern accept an end at this edge.
            if (sdawg->end_of_word(dawg_edge) && punc_dawg->end_of_word(punc_transition_edge)) {
              dawg_args->valid_end = true;
            }
          }
        }
      }
      // The character may also be plain punctuation continuing the punc dawg.
      EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
      if (punc_edge != NO_EDGE) {
        if (dawg_debug_level >= 3) {
          tprintf("Letter found in punctuation dawg\n");
        }
        dawg_args->updated_dawgs->add_unique(
            DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), dawg_debug_level > 0,
            "Extend punctuation dawg: ");
        if (PUNC_PERM > curr_perm) {
          curr_perm = PUNC_PERM;
        }
        if (punc_dawg->end_of_word(punc_edge)) {
          dawg_args->valid_end = true;
        }
      }
      continue;
    }

    if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
      // We can end the main word here.
      // If we can continue on the punc ref, add that possibility.
      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
      EDGE_REF punc_edge =
          punc_node == NO_EDGE ? NO_EDGE : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
      if (punc_edge != NO_EDGE) {
        dawg_args->updated_dawgs->add_unique(
            DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index, punc_edge, true),
            dawg_debug_level > 0, "Return to punctuation dawg: ");
        if (dawg->permuter() > curr_perm) {
          curr_perm = dawg->permuter();
        }
        if (punc_dawg->end_of_word(punc_edge)) {
          dawg_args->valid_end = true;
        }
      }
    }

    // Once we have returned to trailing punctuation, the core dawg cannot
    // be extended any further.
    if (pos.back_to_punc) {
      continue;
    }

    // If we are dealing with the pattern dawg, look up all the
    // possible edges, not only for the exact unichar_id, but also
    // for all its character classes (alpha, digit, etc).
    if (dawg->type() == DAWG_TYPE_PATTERN) {
      ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args, &curr_perm);
      // There can't be any successors to dawg that is of type
      // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
      continue;
    }

    // Find the edge out of the node for the unichar_id.
    NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
    EDGE_REF edge =
        (node == NO_EDGE)
            ? NO_EDGE
            : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg), word_end);

    if (dawg_debug_level >= 3) {
      tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node, edge);
    }

    if (edge != NO_EDGE) { // the unichar was found in the current dawg
      if (dawg_debug_level >= 3) {
        tprintf("Letter found in dawg %d\n", pos.dawg_index);
      }
      if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
        if (dawg_debug_level >= 3) {
          tprintf("Punctuation constraint not satisfied at end of word.\n");
        }
        continue;
      }
      if (dawg->permuter() > curr_perm) {
        curr_perm = dawg->permuter();
      }
      if (dawg->end_of_word(edge) &&
          (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref))) {
        dawg_args->valid_end = true;
      }
      dawg_args->updated_dawgs->add_unique(
          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, false),
          dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
    }
  } // end for
  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
  // or if we found the current letter in a non-punctuation dawg. This
  // allows preserving information on which dawg the "core" word came from.
  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
      (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
    dawg_args->permuter = curr_perm;
  }
  if (dawg_debug_level >= 2) {
    tprintf("Returning %d for permuter code for this character.\n", dawg_args->permuter);
  }
  return dawg_args->permuter;
}
| 571 | |
// Advances a DAWG_TYPE_PATTERN dawg position by unichar_id. Pattern dawgs
// match character classes, so the lookup covers the exact unichar_id plus
// every pattern id (alpha, digit, etc.) it belongs to; for each it checks
// both ordinary outgoing edges and the pattern's self-loop edges. Viable
// positions are appended to dawg_args->updated_dawgs and curr_perm /
// dawg_args->valid_end are updated accordingly.
void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHAR_ID unichar_id,
                               bool word_end, DawgArgs *dawg_args, PermuterType *curr_perm) const {
  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
  // Try to find the edge corresponding to the exact unichar_id and to all the
  // edges corresponding to the character class of unichar_id.
  std::vector<UNICHAR_ID> unichar_id_patterns;
  unichar_id_patterns.push_back(unichar_id);
  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
  for (int unichar_id_pattern : unichar_id_patterns) {
    // On the first iteration check all the outgoing edges.
    // On the second iteration check all self-loops.
    for (int k = 0; k < 2; ++k) {
      EDGE_REF edge = (k == 0)
                          ? dawg->edge_char_of(node, unichar_id_pattern, word_end)
                          : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_pattern, word_end);
      if (edge == NO_EDGE) {
        continue;
      }
      if (dawg_debug_level >= 3) {
        tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node,
                edge);
        tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
      }
      // Track the strongest permuter seen across all matching edges.
      if (dawg->permuter() > *curr_perm) {
        *curr_perm = dawg->permuter();
      }
      if (dawg->end_of_word(edge)) {
        dawg_args->valid_end = true;
      }
      dawg_args->updated_dawgs->add_unique(
          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, pos.back_to_punc),
          dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
    }
  }
}
| 607 | |
| 608 // Fill the given active_dawgs vector with dawgs that could contain the | |
| 609 // beginning of the word. If hyphenated() returns true, copy the entries | |
| 610 // from hyphen_active_dawgs_ instead. | |
| 611 void Dict::init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const { | |
| 612 if (hyphenated()) { | |
| 613 *active_dawgs = hyphen_active_dawgs_; | |
| 614 if (dawg_debug_level >= 3) { | |
| 615 for (unsigned i = 0; i < hyphen_active_dawgs_.size(); ++i) { | |
| 616 tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n", | |
| 617 hyphen_active_dawgs_[i].dawg_index, hyphen_active_dawgs_[i].dawg_ref); | |
| 618 } | |
| 619 } | |
| 620 } else { | |
| 621 default_dawgs(active_dawgs, ambigs_mode); | |
| 622 } | |
| 623 } | |
| 624 | |
| 625 void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_patterns) const { | |
| 626 bool punc_dawg_available = (punc_dawg_ != nullptr) && | |
| 627 punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE; | |
| 628 | |
| 629 for (unsigned i = 0; i < dawgs_.size(); i++) { | |
| 630 if (dawgs_[i] != nullptr && !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) { | |
| 631 int dawg_ty = dawgs_[i]->type(); | |
| 632 bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty]; | |
| 633 if (dawg_ty == DAWG_TYPE_PUNCTUATION) { | |
| 634 dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false)); | |
| 635 if (dawg_debug_level >= 3) { | |
| 636 tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); | |
| 637 } | |
| 638 } else if (!punc_dawg_available || !subsumed_by_punc) { | |
| 639 dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false)); | |
| 640 if (dawg_debug_level >= 3) { | |
| 641 tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); | |
| 642 } | |
| 643 } | |
| 644 } | |
| 645 } | |
| 646 } | |
| 647 | |
// Considers adding best_choice to the document dictionary. A word is only
// promoted to document_words_ after passing several filters: it must not
// be part of a hyphenated word, not already be a valid dictionary word,
// be at least 2 unichars long, not be a run of repeated characters, and —
// when its certainty is low or it is only 2 chars — it must first have
// been seen once before (tracked via pending_words_). Optionally appends
// the accepted word to "<imagefile>.doc" when save_doc_words is set.
void Dict::add_document_word(const WERD_CHOICE &best_choice) {
  // Do not add hyphenated word parts to the document dawg.
  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
  // called when the first part of the hyphenated word is
  // discovered and while the second part of the word is recognized.
  // hyphen_word_ is cleared in cc_recg() before the next word on
  // the line is recognized.
  if (hyphen_word_) {
    return;
  }

  int stringlen = best_choice.length();

  // Already a dictionary word, or too short to be informative.
  if (valid_word(best_choice) || stringlen < 2) {
    return;
  }

  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
  if (best_choice.length() >= kDocDictMaxRepChars) {
    int num_rep_chars = 1;
    UNICHAR_ID uch_id = best_choice.unichar_id(0);
    for (unsigned i = 1; i < best_choice.length(); ++i) {
      if (best_choice.unichar_id(i) != uch_id) {
        num_rep_chars = 1;
        uch_id = best_choice.unichar_id(i);
      } else {
        ++num_rep_chars;
        if (num_rep_chars == kDocDictMaxRepChars) {
          return;
        }
      }
    }
  }

  // Low-certainty and two-character words go through a two-strike process:
  // the first sighting only records the word in pending_words_; a second
  // sighting falls through to the document dawg below.
  if (best_choice.certainty() < doc_dict_certainty_threshold || stringlen == 2) {
    if (best_choice.certainty() < doc_dict_pending_threshold) {
      return;
    }

    if (!pending_words_->word_in_dawg(best_choice)) {
      // Two-char words are only considered when both chars are uppercase
      // (likely an acronym).
      if (stringlen > 2 ||
          (stringlen == 2 && getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
           getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
        pending_words_->add_word_to_dawg(best_choice);
      }
      return;
    }
  }

  if (save_doc_words) {
    // Append the accepted word to "<imagefile>.doc" for debugging/training.
    std::string filename(getCCUtil()->imagefile);
    filename += ".doc";
    FILE *doc_word_file = fopen(filename.c_str(), "a");
    if (doc_word_file == nullptr) {
      tprintf("Error: Could not open file %s\n", filename.c_str());
      ASSERT_HOST(doc_word_file);
    }
    fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
    fclose(doc_word_file);
  }
  document_words_->add_word_to_dawg(best_choice);
}
| 710 | |
| 711 void Dict::adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, | |
| 712 float additional_adjust, bool modify_rating, bool debug) { | |
| 713 bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() && | |
| 714 word->GetTopScriptID() == getUnicharset().han_sid()); | |
| 715 bool case_is_ok = (is_han || case_ok(*word)); | |
| 716 bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word)); | |
| 717 | |
| 718 float adjust_factor = additional_adjust; | |
| 719 float new_rating = word->rating(); | |
| 720 new_rating += kRatingPad; | |
| 721 const char *xheight_triggered = ""; | |
| 722 if (word->length() > 1) { | |
| 723 // Calculate x-height and y-offset consistency penalties. | |
| 724 switch (xheight_consistency) { | |
| 725 case XH_INCONSISTENT: | |
| 726 adjust_factor += xheight_penalty_inconsistent; | |
| 727 xheight_triggered = ", xhtBAD"; | |
| 728 break; | |
| 729 case XH_SUBNORMAL: | |
| 730 adjust_factor += xheight_penalty_subscripts; | |
| 731 xheight_triggered = ", xhtSUB"; | |
| 732 break; | |
| 733 case XH_GOOD: | |
| 734 // leave the factor alone - all good! | |
| 735 break; | |
| 736 } | |
| 737 // TODO(eger): if nonword is true, but there is a "core" that is a dict | |
| 738 // word, negate nonword status. | |
| 739 } else { | |
| 740 if (debug) { | |
| 741 tprintf("Consistency could not be calculated.\n"); | |
| 742 } | |
| 743 } | |
| 744 if (debug) { | |
| 745 tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", word->unichar_string().c_str(), | |
| 746 word->rating(), xheight_triggered); | |
| 747 } | |
| 748 | |
| 749 if (nonword) { // non-dictionary word | |
| 750 if (case_is_ok && punc_is_ok) { | |
| 751 adjust_factor += segment_penalty_dict_nonword; | |
| 752 new_rating *= adjust_factor; | |
| 753 if (debug) { | |
| 754 tprintf(", W"); | |
| 755 } | |
| 756 } else { | |
| 757 adjust_factor += segment_penalty_garbage; | |
| 758 new_rating *= adjust_factor; | |
| 759 if (debug) { | |
| 760 if (!case_is_ok) { | |
| 761 tprintf(", C"); | |
| 762 } | |
| 763 if (!punc_is_ok) { | |
| 764 tprintf(", P"); | |
| 765 } | |
| 766 } | |
| 767 } | |
| 768 } else { // dictionary word | |
| 769 if (case_is_ok) { | |
| 770 if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) { | |
| 771 word->set_permuter(FREQ_DAWG_PERM); | |
| 772 adjust_factor += segment_penalty_dict_frequent_word; | |
| 773 new_rating *= adjust_factor; | |
| 774 if (debug) { | |
| 775 tprintf(", F"); | |
| 776 } | |
| 777 } else { | |
| 778 adjust_factor += segment_penalty_dict_case_ok; | |
| 779 new_rating *= adjust_factor; | |
| 780 if (debug) { | |
| 781 tprintf(", "); | |
| 782 } | |
| 783 } | |
| 784 } else { | |
| 785 adjust_factor += segment_penalty_dict_case_bad; | |
| 786 new_rating *= adjust_factor; | |
| 787 if (debug) { | |
| 788 tprintf(", C"); | |
| 789 } | |
| 790 } | |
| 791 } | |
| 792 new_rating -= kRatingPad; | |
| 793 if (modify_rating) { | |
| 794 word->set_rating(new_rating); | |
| 795 } | |
| 796 if (debug) { | |
| 797 tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating); | |
| 798 } | |
| 799 word->set_adjust_factor(adjust_factor); | |
| 800 } | |
| 801 | |
| 802 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const { | |
| 803 const WERD_CHOICE *word_ptr = &word; | |
| 804 WERD_CHOICE temp_word(word.unicharset()); | |
| 805 if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) { | |
| 806 copy_hyphen_info(&temp_word); | |
| 807 temp_word += word; | |
| 808 word_ptr = &temp_word; | |
| 809 } | |
| 810 if (word_ptr->empty()) { | |
| 811 return NO_PERM; | |
| 812 } | |
| 813 // Allocate vectors for holding current and updated | |
| 814 // active_dawgs and initialize them. | |
| 815 DawgPositionVector active_dawgs[2]; | |
| 816 init_active_dawgs(&(active_dawgs[0]), false); | |
| 817 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM); | |
| 818 int last_index = word_ptr->length() - 1; | |
| 819 // Call letter_is_okay for each letter in the word. | |
| 820 for (int i = hyphen_base_size(); i <= last_index; ++i) { | |
| 821 if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), word_ptr->unichar_id(i), | |
| 822 i == last_index))) { | |
| 823 break; | |
| 824 } | |
| 825 // Swap active_dawgs, constraints with the corresponding updated vector. | |
| 826 if (dawg_args.updated_dawgs == &(active_dawgs[1])) { | |
| 827 dawg_args.updated_dawgs = &(active_dawgs[0]); | |
| 828 ++(dawg_args.active_dawgs); | |
| 829 } else { | |
| 830 ++(dawg_args.updated_dawgs); | |
| 831 dawg_args.active_dawgs = &(active_dawgs[0]); | |
| 832 } | |
| 833 } | |
| 834 return valid_word_permuter(dawg_args.permuter, numbers_ok) ? dawg_args.permuter : NO_PERM; | |
| 835 } | |
| 836 | |
| 837 bool Dict::valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const { | |
| 838 if (bigram_dawg_ == nullptr) { | |
| 839 return false; | |
| 840 } | |
| 841 | |
| 842 // Extract the core word from the middle of each word with any digits | |
| 843 // replaced with question marks. | |
| 844 unsigned w1start, w1end, w2start, w2end; | |
| 845 word1.punct_stripped(&w1start, &w1end); | |
| 846 word2.punct_stripped(&w2start, &w2end); | |
| 847 | |
| 848 // We don't want to penalize a single guillemet, hyphen, etc. | |
| 849 // But our bigram list doesn't have any information about punctuation. | |
| 850 if (w1start >= w1end) { | |
| 851 return word1.length() < 3; | |
| 852 } | |
| 853 if (w2start >= w2end) { | |
| 854 return word2.length() < 3; | |
| 855 } | |
| 856 | |
| 857 const UNICHARSET &uchset = getUnicharset(); | |
| 858 std::vector<UNICHAR_ID> bigram_string; | |
| 859 bigram_string.reserve(w1end + w2end + 1); | |
| 860 for (auto i = w1start; i < w1end; i++) { | |
| 861 const auto &normed_ids = getUnicharset().normed_ids(word1.unichar_id(i)); | |
| 862 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) { | |
| 863 bigram_string.push_back(question_unichar_id_); | |
| 864 } else { | |
| 865 bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end()); | |
| 866 } | |
| 867 } | |
| 868 bigram_string.push_back(UNICHAR_SPACE); | |
| 869 for (auto i = w2start; i < w2end; i++) { | |
| 870 const auto &normed_ids = getUnicharset().normed_ids(word2.unichar_id(i)); | |
| 871 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) { | |
| 872 bigram_string.push_back(question_unichar_id_); | |
| 873 } else { | |
| 874 bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end()); | |
| 875 } | |
| 876 } | |
| 877 WERD_CHOICE normalized_word(&uchset, bigram_string.size()); | |
| 878 for (int i : bigram_string) { | |
| 879 normalized_word.append_unichar_id_space_allocated(i, 1, 0.0f, 0.0f); | |
| 880 } | |
| 881 return bigram_dawg_->word_in_dawg(normalized_word); | |
| 882 } | |
| 883 | |
| 884 bool Dict::valid_punctuation(const WERD_CHOICE &word) { | |
| 885 if (word.empty()) { | |
| 886 return NO_PERM; | |
| 887 } | |
| 888 WERD_CHOICE new_word(word.unicharset()); | |
| 889 auto last_index = word.length() - 1; | |
| 890 int new_len; | |
| 891 for (unsigned i = 0; i <= last_index; ++i) { | |
| 892 UNICHAR_ID unichar_id = (word.unichar_id(i)); | |
| 893 if (getUnicharset().get_ispunctuation(unichar_id)) { | |
| 894 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0); | |
| 895 } else if (!getUnicharset().get_isalpha(unichar_id) && | |
| 896 !getUnicharset().get_isdigit(unichar_id)) { | |
| 897 return false; // neither punc, nor alpha, nor digit | |
| 898 } else if ((new_len = new_word.length()) == 0 || | |
| 899 new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) { | |
| 900 new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0); | |
| 901 } | |
| 902 } | |
| 903 for (unsigned i = 0; i < dawgs_.size(); ++i) { | |
| 904 if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION && | |
| 905 dawgs_[i]->word_in_dawg(new_word)) { | |
| 906 return true; | |
| 907 } | |
| 908 } | |
| 909 return false; | |
| 910 } | |
| 911 | |
| 912 /// Returns true if the language is space-delimited (not CJ, or T). | |
| 913 bool Dict::IsSpaceDelimitedLang() const { | |
| 914 const UNICHARSET &u_set = getUnicharset(); | |
| 915 if (u_set.han_sid() > 0) { | |
| 916 return false; | |
| 917 } | |
| 918 if (u_set.katakana_sid() > 0) { | |
| 919 return false; | |
| 920 } | |
| 921 if (u_set.thai_sid() > 0) { | |
| 922 return false; | |
| 923 } | |
| 924 return true; | |
| 925 } | |
| 926 | |
| 927 } // namespace tesseract |
