Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/tfacepp.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: tfacepp.cpp (Formerly tface++.c) | |
| 3 * Description: C++ side of the C/C++ Tess/Editor interface. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 1992, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #include <cmath> | |
| 20 | |
| 21 #include "blamer.h" | |
| 22 #include "errcode.h" | |
| 23 #include "ratngs.h" | |
| 24 #include "reject.h" | |
| 25 #include "tesseractclass.h" | |
| 26 #include "werd.h" | |
| 27 | |
| 28 #define MAX_UNDIVIDED_LENGTH 24 | |
| 29 | |
| 30 /********************************************************************** | |
| 31 * recog_word | |
| 32 * | |
| 33 * Convert the word to tess form and pass it to the tess segmenter. | |
| 34 * Convert the output back to editor form. | |
| 35 **********************************************************************/ | |
| 36 namespace tesseract { | |
| 37 void Tesseract::recog_word(WERD_RES *word) { | |
| 38 if (wordrec_skip_no_truth_words && | |
| 39 (word->blamer_bundle == nullptr || | |
| 40 word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) { | |
| 41 if (classify_debug_level) { | |
| 42 tprintf("No truth for word - skipping\n"); | |
| 43 } | |
| 44 word->tess_failed = true; | |
| 45 return; | |
| 46 } | |
| 47 ASSERT_HOST(!word->chopped_word->blobs.empty()); | |
| 48 recog_word_recursive(word); | |
| 49 word->SetupBoxWord(); | |
| 50 ASSERT_HOST(static_cast<unsigned>(word->best_choice->length()) == word->box_word->length()); | |
| 51 // Check that the ratings matrix size matches the sum of all the | |
| 52 // segmentation states. | |
| 53 if (!word->StatesAllValid()) { | |
| 54 tprintf("Not all words have valid states relative to ratings matrix!!"); | |
| 55 word->DebugWordChoices(true, nullptr); | |
| 56 ASSERT_HOST(word->StatesAllValid()); | |
| 57 } | |
| 58 if (tessedit_override_permuter) { | |
| 59 /* Override the permuter type if a straight dictionary check disagrees. */ | |
| 60 uint8_t perm_type = word->best_choice->permuter(); | |
| 61 if ((perm_type != SYSTEM_DAWG_PERM) && (perm_type != FREQ_DAWG_PERM) && | |
| 62 (perm_type != USER_DAWG_PERM)) { | |
| 63 uint8_t real_dict_perm_type = dict_word(*word->best_choice); | |
| 64 if (((real_dict_perm_type == SYSTEM_DAWG_PERM) || (real_dict_perm_type == FREQ_DAWG_PERM) || | |
| 65 (real_dict_perm_type == USER_DAWG_PERM)) && | |
| 66 (alpha_count(word->best_choice->unichar_string().c_str(), | |
| 67 word->best_choice->unichar_lengths().c_str()) > 0)) { | |
| 68 word->best_choice->set_permuter(real_dict_perm_type); // use dict perm | |
| 69 } | |
| 70 } | |
| 71 if (tessedit_rejection_debug && perm_type != word->best_choice->permuter()) { | |
| 72 tprintf("Permuter Type Flipped from %d to %d\n", perm_type, word->best_choice->permuter()); | |
| 73 } | |
| 74 } | |
| 75 // Factored out from control.cpp | |
| 76 ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr)); | |
| 77 if (word->best_choice == nullptr || word->best_choice->empty() || | |
| 78 strspn(word->best_choice->unichar_string().c_str(), " ") == | |
| 79 word->best_choice->length()) { | |
| 80 word->tess_failed = true; | |
| 81 word->reject_map.initialise(word->box_word->length()); | |
| 82 word->reject_map.rej_word_tess_failure(); | |
| 83 } else { | |
| 84 word->tess_failed = false; | |
| 85 } | |
| 86 } | |
| 87 | |
| 88 /********************************************************************** | |
| 89 * recog_word_recursive | |
| 90 * | |
| 91 * Convert the word to tess form and pass it to the tess segmenter. | |
| 92 * Convert the output back to editor form. | |
| 93 **********************************************************************/ | |
| 94 void Tesseract::recog_word_recursive(WERD_RES *word) { | |
| 95 auto word_length = word->chopped_word->NumBlobs(); // no of blobs | |
| 96 if (word_length > MAX_UNDIVIDED_LENGTH) { | |
| 97 return split_and_recog_word(word); | |
| 98 } | |
| 99 cc_recog(word); | |
| 100 word_length = word->rebuild_word->NumBlobs(); // No of blobs in output. | |
| 101 | |
| 102 // Do sanity checks and minor fixes on best_choice. | |
| 103 if (word->best_choice->length() > word_length) { | |
| 104 word->best_choice->make_bad(); // should never happen | |
| 105 tprintf( | |
| 106 "recog_word: Discarded long string \"%s\"" | |
| 107 " (%d characters vs %d blobs)\n", | |
| 108 word->best_choice->unichar_string().c_str(), word->best_choice->length(), word_length); | |
| 109 tprintf("Word is at:"); | |
| 110 word->word->bounding_box().print(); | |
| 111 } | |
| 112 if (word->best_choice->length() < word_length) { | |
| 113 UNICHAR_ID space_id = unicharset.unichar_to_id(" "); | |
| 114 while (word->best_choice->length() < word_length) { | |
| 115 word->best_choice->append_unichar_id(space_id, 1, 0.0, word->best_choice->certainty()); | |
| 116 } | |
| 117 } | |
| 118 } | |
| 119 | |
| 120 /********************************************************************** | |
| 121 * split_and_recog_word | |
| 122 * | |
| 123 * Split the word into 2 smaller pieces at the largest gap. | |
| 124 * Recognize the pieces and stick the results back together. | |
| 125 **********************************************************************/ | |
| 126 void Tesseract::split_and_recog_word(WERD_RES *word) { | |
| 127 // Find the biggest blob gap in the chopped_word. | |
| 128 int bestgap = -INT32_MAX; | |
| 129 int split_index = 0; | |
| 130 for (unsigned b = 1; b < word->chopped_word->NumBlobs(); ++b) { | |
| 131 TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box(); | |
| 132 TBOX blob_box = word->chopped_word->blobs[b]->bounding_box(); | |
| 133 int gap = blob_box.left() - prev_box.right(); | |
| 134 if (gap > bestgap) { | |
| 135 bestgap = gap; | |
| 136 split_index = b; | |
| 137 } | |
| 138 } | |
| 139 ASSERT_HOST(split_index > 0); | |
| 140 | |
| 141 WERD_RES *word2 = nullptr; | |
| 142 BlamerBundle *orig_bb = nullptr; | |
| 143 split_word(word, split_index, &word2, &orig_bb); | |
| 144 | |
| 145 // Recognize the first part of the word. | |
| 146 recog_word_recursive(word); | |
| 147 // Recognize the second part of the word. | |
| 148 recog_word_recursive(word2); | |
| 149 | |
| 150 join_words(word, word2, orig_bb); | |
| 151 } | |
| 152 | |
| 153 /********************************************************************** | |
| 154 * split_word | |
| 155 * | |
| 156 * Split a given WERD_RES in place into two smaller words for recognition. | |
| 157 * split_pt is the index of the first blob to go in the second word. | |
| 158 * The underlying word is left alone, only the TWERD (and subsequent data) | |
| 159 * are split up. orig_blamer_bundle is set to the original blamer bundle, | |
| 160 * and will now be owned by the caller. New blamer bundles are forged for the | |
| 161 * two pieces. | |
| 162 **********************************************************************/ | |
| 163 void Tesseract::split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece, | |
| 164 BlamerBundle **orig_blamer_bundle) const { | |
| 165 ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs()); | |
| 166 | |
| 167 // Save a copy of the blamer bundle so we can try to reconstruct it below. | |
| 168 BlamerBundle *orig_bb = word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr; | |
| 169 | |
| 170 auto *word2 = new WERD_RES(*word); | |
| 171 | |
| 172 // blow away the copied chopped_word, as we want to work with | |
| 173 // the blobs from the input chopped_word so seam_arrays can be merged. | |
| 174 TWERD *chopped = word->chopped_word; | |
| 175 auto *chopped2 = new TWERD; | |
| 176 chopped2->blobs.reserve(chopped->NumBlobs() - split_pt); | |
| 177 for (auto i = split_pt; i < chopped->NumBlobs(); ++i) { | |
| 178 chopped2->blobs.push_back(chopped->blobs[i]); | |
| 179 } | |
| 180 chopped->blobs.resize(split_pt); | |
| 181 word->chopped_word = nullptr; | |
| 182 delete word2->chopped_word; | |
| 183 word2->chopped_word = nullptr; | |
| 184 | |
| 185 const UNICHARSET &unicharset = *word->uch_set; | |
| 186 word->ClearResults(); | |
| 187 word2->ClearResults(); | |
| 188 word->chopped_word = chopped; | |
| 189 word2->chopped_word = chopped2; | |
| 190 word->SetupBasicsFromChoppedWord(unicharset); | |
| 191 word2->SetupBasicsFromChoppedWord(unicharset); | |
| 192 | |
| 193 // Try to adjust the blamer bundle. | |
| 194 if (orig_bb != nullptr) { | |
| 195 // TODO(rays) Looks like a leak to me. | |
| 196 // orig_bb should take, rather than copy. | |
| 197 word->blamer_bundle = new BlamerBundle(); | |
| 198 word2->blamer_bundle = new BlamerBundle(); | |
| 199 orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(), | |
| 200 word2->chopped_word->blobs[0]->bounding_box().left(), wordrec_debug_blamer, | |
| 201 word->blamer_bundle, word2->blamer_bundle); | |
| 202 } | |
| 203 | |
| 204 *right_piece = word2; | |
| 205 *orig_blamer_bundle = orig_bb; | |
| 206 } | |
| 207 | |
| 208 /********************************************************************** | |
| 209 * join_words | |
| 210 * | |
| 211 * The opposite of split_word(): | |
| 212 * join word2 (including any recognized data / seam array / etc) | |
| 213 * onto the right of word and then delete word2. | |
| 214 * Also, if orig_bb is provided, stitch it back into word. | |
| 215 **********************************************************************/ | |
| 216 void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const { | |
| 217 TBOX prev_box = word->chopped_word->blobs.back()->bounding_box(); | |
| 218 TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box(); | |
| 219 // Tack the word2 outputs onto the end of the word outputs. | |
| 220 word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end()); | |
| 221 word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end()); | |
| 222 word2->chopped_word->blobs.clear(); | |
| 223 word2->rebuild_word->blobs.clear(); | |
| 224 TPOINT split_pt; | |
| 225 split_pt.x = (prev_box.right() + blob_box.left()) / 2; | |
| 226 split_pt.y = (prev_box.top() + prev_box.bottom() + blob_box.top() + blob_box.bottom()) / 4; | |
| 227 // Move the word2 seams onto the end of the word1 seam_array. | |
| 228 // Since the seam list is one element short, an empty seam marking the | |
| 229 // end of the last blob in the first word is needed first. | |
| 230 word->seam_array.push_back(new SEAM(0.0f, split_pt)); | |
| 231 word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end()); | |
| 232 word2->seam_array.clear(); | |
| 233 // Fix widths and gaps. | |
| 234 word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end()); | |
| 235 word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end()); | |
| 236 // Fix the ratings matrix. | |
| 237 int rat1 = word->ratings->dimension(); | |
| 238 int rat2 = word2->ratings->dimension(); | |
| 239 word->ratings->AttachOnCorner(word2->ratings); | |
| 240 ASSERT_HOST(word->ratings->dimension() == rat1 + rat2); | |
| 241 word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end()); | |
| 242 // Append the word choices. | |
| 243 *word->raw_choice += *word2->raw_choice; | |
| 244 | |
| 245 // How many alt choices from each should we try to get? | |
| 246 const int kAltsPerPiece = 2; | |
| 247 // When do we start throwing away extra alt choices? | |
| 248 const int kTooManyAltChoices = 100; | |
| 249 | |
| 250 // Construct the cartesian product of the best_choices of word(1) and word2. | |
| 251 WERD_CHOICE_LIST joined_choices; | |
| 252 WERD_CHOICE_IT jc_it(&joined_choices); | |
| 253 WERD_CHOICE_IT bc1_it(&word->best_choices); | |
| 254 WERD_CHOICE_IT bc2_it(&word2->best_choices); | |
| 255 int num_word1_choices = word->best_choices.length(); | |
| 256 int total_joined_choices = num_word1_choices; | |
| 257 // Nota Bene: For the main loop here, we operate only on the 2nd and greater | |
| 258 // word2 choices, and put them in the joined_choices list. The 1st word2 | |
| 259 // choice gets added to the original word1 choices in-place after we have | |
| 260 // finished with them. | |
| 261 int bc2_index = 1; | |
| 262 for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) { | |
| 263 if (total_joined_choices >= kTooManyAltChoices && bc2_index > kAltsPerPiece) { | |
| 264 break; | |
| 265 } | |
| 266 int bc1_index = 0; | |
| 267 for (bc1_it.move_to_first(); bc1_index < num_word1_choices; ++bc1_index, bc1_it.forward()) { | |
| 268 if (total_joined_choices >= kTooManyAltChoices && bc1_index > kAltsPerPiece) { | |
| 269 break; | |
| 270 } | |
| 271 auto *wc = new WERD_CHOICE(*bc1_it.data()); | |
| 272 *wc += *bc2_it.data(); | |
| 273 jc_it.add_after_then_move(wc); | |
| 274 ++total_joined_choices; | |
| 275 } | |
| 276 } | |
| 277 // Now that we've filled in as many alternates as we want, paste the best | |
| 278 // choice for word2 onto the original word alt_choices. | |
| 279 bc1_it.move_to_first(); | |
| 280 bc2_it.move_to_first(); | |
| 281 for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) { | |
| 282 *bc1_it.data() += *bc2_it.data(); | |
| 283 } | |
| 284 bc1_it.move_to_last(); | |
| 285 bc1_it.add_list_after(&joined_choices); | |
| 286 | |
| 287 // Restore the pointer to original blamer bundle and combine blamer | |
| 288 // information recorded in the splits. | |
| 289 if (orig_bb != nullptr) { | |
| 290 orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle, wordrec_debug_blamer); | |
| 291 delete word->blamer_bundle; | |
| 292 word->blamer_bundle = orig_bb; | |
| 293 } | |
| 294 word->SetupBoxWord(); | |
| 295 word->reject_map.initialise(word->box_word->length()); | |
| 296 delete word2; | |
| 297 } | |
| 298 | |
| 299 } // namespace tesseract |
