Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/wordrec/segsearch.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: segsearch.cpp | |
| 3 // Description: Segmentation search functions. | |
| 4 // Author: Daria Antonova | |
| 5 // | |
| 6 // (C) Copyright 2009, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #include <cstdint> // for INT32_MAX | |
| 20 #include "blamer.h" // for BlamerBundle | |
| 21 #include "errcode.h" // for ASSERT_HOST | |
| 22 #include "lm_pain_points.h" // for LMPainPoints, LM_PPTYPE_SHAPE, LMPainPoi... | |
| 23 #include "lm_state.h" // for BestChoiceBundle, ViterbiStateEntry | |
| 24 #include "matrix.h" // for MATRIX_COORD, MATRIX | |
| 25 #include "pageres.h" // for WERD_RES | |
| 26 #include "params.h" // for BoolParam, IntParam, DoubleParam | |
| 27 #include "ratngs.h" // for BLOB_CHOICE_LIST, BLOB_CHOICE_IT | |
| 28 #include "tprintf.h" // for tprintf | |
| 29 #include "wordrec.h" // for Wordrec, SegSearchPending (ptr only) | |
| 30 | |
| 31 namespace tesseract { | |
| 32 | |
| 33 void Wordrec::SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, | |
| 34 BlamerBundle *blamer_bundle) { | |
| 35 LMPainPoints pain_points(segsearch_max_pain_points, segsearch_max_char_wh_ratio, | |
| 36 assume_fixed_pitch_char_segment, &getDict(), segsearch_debug_level); | |
| 37 // Compute scaling factor that will help us recover blob outline length | |
| 38 // from classifier rating and certainty for the blob. | |
| 39 float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale; | |
| 40 std::vector<SegSearchPending> pending; | |
| 41 InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle, blamer_bundle); | |
| 42 | |
| 43 if (!SegSearchDone(0)) { // find a better choice | |
| 44 if (chop_enable && word_res->chopped_word != nullptr) { | |
| 45 improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle, blamer_bundle, | |
| 46 &pain_points, &pending); | |
| 47 } | |
| 48 if (chop_debug) { | |
| 49 SEAM::PrintSeams("Final seam list:", word_res->seam_array); | |
| 50 } | |
| 51 | |
| 52 if (blamer_bundle != nullptr && !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) { | |
| 53 blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer); | |
| 54 } | |
| 55 } | |
| 56 // Keep trying to find a better path by fixing the "pain points". | |
| 57 | |
| 58 MATRIX_COORD pain_point; | |
| 59 float pain_point_priority; | |
| 60 int num_futile_classifications = 0; | |
| 61 std::string blamer_debug; | |
| 62 while (wordrec_enable_assoc && | |
| 63 (!SegSearchDone(num_futile_classifications) || | |
| 64 (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()))) { | |
| 65 // Get the next valid "pain point". | |
| 66 bool found_nothing = true; | |
| 67 LMPainPointsType pp_type; | |
| 68 while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) != LM_PPTYPE_NUM) { | |
| 69 if (!pain_point.Valid(*word_res->ratings)) { | |
| 70 word_res->ratings->IncreaseBandSize(pain_point.row - pain_point.col + 1); | |
| 71 } | |
| 72 if (pain_point.Valid(*word_res->ratings) && | |
| 73 !word_res->ratings->Classified(pain_point.col, pain_point.row, getDict().WildcardID())) { | |
| 74 found_nothing = false; | |
| 75 break; | |
| 76 } | |
| 77 } | |
| 78 if (found_nothing) { | |
| 79 if (segsearch_debug_level > 0) { | |
| 80 tprintf("Pain points queue is empty\n"); | |
| 81 } | |
| 82 break; | |
| 83 } | |
| 84 ProcessSegSearchPainPoint(pain_point_priority, pain_point, | |
| 85 LMPainPoints::PainPointDescription(pp_type), &pending, word_res, | |
| 86 &pain_points, blamer_bundle); | |
| 87 | |
| 88 UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending, word_res, &pain_points, | |
| 89 best_choice_bundle, blamer_bundle); | |
| 90 if (!best_choice_bundle->updated) { | |
| 91 ++num_futile_classifications; | |
| 92 } | |
| 93 | |
| 94 if (segsearch_debug_level > 0) { | |
| 95 tprintf("num_futile_classifications %d\n", num_futile_classifications); | |
| 96 } | |
| 97 | |
| 98 best_choice_bundle->updated = false; // reset updated | |
| 99 | |
| 100 // See if it's time to terminate SegSearch or time for starting a guided | |
| 101 // search for the true path to find the blame for the incorrect best_choice. | |
| 102 if (SegSearchDone(num_futile_classifications) && blamer_bundle != nullptr && | |
| 103 blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) { | |
| 104 InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle, blamer_debug); | |
| 105 } | |
| 106 } // end while loop exploring alternative paths | |
| 107 if (blamer_bundle != nullptr) { | |
| 108 blamer_bundle->FinishSegSearch(word_res->best_choice, wordrec_debug_blamer, blamer_debug); | |
| 109 } | |
| 110 | |
| 111 if (segsearch_debug_level > 0) { | |
| 112 tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n", | |
| 113 language_model_->AcceptableChoiceFound()); | |
| 114 } | |
| 115 } | |
| 116 | |
| 117 // Setup and run just the initial segsearch on an established matrix, | |
| 118 // without doing any additional chopping or joining. | |
| 119 // (Internal factored version that can be used as part of the main SegSearch.) | |
| 120 void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, | |
| 121 std::vector<SegSearchPending> *pending, | |
| 122 BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) { | |
| 123 if (segsearch_debug_level > 0) { | |
| 124 tprintf("Starting SegSearch on ratings matrix%s:\n", | |
| 125 wordrec_enable_assoc ? " (with assoc)" : ""); | |
| 126 word_res->ratings->print(getDict().getUnicharset()); | |
| 127 } | |
| 128 | |
| 129 pain_points->GenerateInitial(word_res); | |
| 130 | |
| 131 // Compute scaling factor that will help us recover blob outline length | |
| 132 // from classifier rating and certainty for the blob. | |
| 133 float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale; | |
| 134 | |
| 135 language_model_->InitForWord(prev_word_best_choice_, assume_fixed_pitch_char_segment, | |
| 136 segsearch_max_char_wh_ratio, rating_cert_scale); | |
| 137 | |
| 138 // Initialize blamer-related information: map character boxes recorded in | |
| 139 // blamer_bundle->norm_truth_word to the corresponding i,j indices in the | |
| 140 // ratings matrix. We expect this step to succeed, since when running the | |
| 141 // chopper we checked that the correct chops are present. | |
| 142 if (blamer_bundle != nullptr) { | |
| 143 blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word, wordrec_debug_blamer); | |
| 144 } | |
| 145 | |
| 146 // pending[col] tells whether there is update work to do to combine | |
| 147 // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *]. | |
| 148 // As the language model state is updated, pending entries are modified to | |
| 149 // minimize duplication of work. It is important that during the update the | |
| 150 // children are considered in the non-decreasing order of their column, since | |
| 151 // this guarantees that all the parents would be up to date before an update | |
| 152 // of a child is done. | |
| 153 pending->clear(); | |
| 154 pending->resize(word_res->ratings->dimension(), SegSearchPending()); | |
| 155 | |
| 156 // Search the ratings matrix for the initial best path. | |
| 157 (*pending)[0].SetColumnClassified(); | |
| 158 UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res, pain_points, best_choice_bundle, | |
| 159 blamer_bundle); | |
| 160 } | |
| 161 | |
| 162 void Wordrec::UpdateSegSearchNodes(float rating_cert_scale, int starting_col, | |
| 163 std::vector<SegSearchPending> *pending, WERD_RES *word_res, | |
| 164 LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, | |
| 165 BlamerBundle *blamer_bundle) { | |
| 166 MATRIX *ratings = word_res->ratings; | |
| 167 ASSERT_HOST(static_cast<unsigned>(ratings->dimension()) == pending->size()); | |
| 168 ASSERT_HOST(static_cast<unsigned>(ratings->dimension()) == best_choice_bundle->beam.size()); | |
| 169 for (int col = starting_col; col < ratings->dimension(); ++col) { | |
| 170 if (!(*pending)[col].WorkToDo()) { | |
| 171 continue; | |
| 172 } | |
| 173 int first_row = col; | |
| 174 int last_row = std::min(ratings->dimension() - 1, col + ratings->bandwidth() - 1); | |
| 175 if ((*pending)[col].SingleRow() >= 0) { | |
| 176 first_row = last_row = (*pending)[col].SingleRow(); | |
| 177 } | |
| 178 if (segsearch_debug_level > 0) { | |
| 179 tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n", col, first_row, | |
| 180 last_row, (*pending)[col].IsRowJustClassified(INT32_MAX)); | |
| 181 } | |
| 182 // Iterate over the pending list for this column. | |
| 183 for (int row = first_row; row <= last_row; ++row) { | |
| 184 // Update language model state of this child+parent pair. | |
| 185 BLOB_CHOICE_LIST *current_node = ratings->get(col, row); | |
| 186 LanguageModelState *parent_node = col == 0 ? nullptr : best_choice_bundle->beam[col - 1]; | |
| 187 if (current_node != nullptr && | |
| 188 language_model_->UpdateState((*pending)[col].IsRowJustClassified(row), col, row, | |
| 189 current_node, parent_node, pain_points, word_res, | |
| 190 best_choice_bundle, blamer_bundle) && | |
| 191 row + 1 < ratings->dimension()) { | |
| 192 // Since the language model state of this entry changed, process all | |
| 193 // the child column. | |
| 194 (*pending)[row + 1].RevisitWholeColumn(); | |
| 195 if (segsearch_debug_level > 0) { | |
| 196 tprintf("Added child col=%d to pending\n", row + 1); | |
| 197 } | |
| 198 } // end if UpdateState. | |
| 199 } // end for row. | |
| 200 } // end for col. | |
| 201 if (best_choice_bundle->best_vse != nullptr) { | |
| 202 ASSERT_HOST(word_res->StatesAllValid()); | |
| 203 if (best_choice_bundle->best_vse->updated) { | |
| 204 pain_points->GenerateFromPath(rating_cert_scale, best_choice_bundle->best_vse, word_res); | |
| 205 if (!best_choice_bundle->fixpt.empty()) { | |
| 206 pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt, best_choice_bundle->best_vse, | |
| 207 word_res); | |
| 208 } | |
| 209 } | |
| 210 } | |
| 211 // The segsearch is completed. Reset all updated flags on all VSEs and reset | |
| 212 // all pendings. | |
| 213 for (unsigned col = 0; col < pending->size(); ++col) { | |
| 214 (*pending)[col].Clear(); | |
| 215 ViterbiStateEntry_IT vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries); | |
| 216 for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) { | |
| 217 vse_it.data()->updated = false; | |
| 218 } | |
| 219 } | |
| 220 } | |
| 221 | |
| 222 void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, | |
| 223 const char *pain_point_type, | |
| 224 std::vector<SegSearchPending> *pending, | |
| 225 WERD_RES *word_res, LMPainPoints *pain_points, | |
| 226 BlamerBundle *blamer_bundle) { | |
| 227 if (segsearch_debug_level > 0) { | |
| 228 tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n", pain_point_type, | |
| 229 pain_point_priority, pain_point.col, pain_point.row); | |
| 230 } | |
| 231 ASSERT_HOST(pain_points != nullptr); | |
| 232 MATRIX *ratings = word_res->ratings; | |
| 233 // Classify blob [pain_point.col pain_point.row] | |
| 234 if (!pain_point.Valid(*ratings)) { | |
| 235 ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col); | |
| 236 } | |
| 237 ASSERT_HOST(pain_point.Valid(*ratings)); | |
| 238 BLOB_CHOICE_LIST *classified = | |
| 239 classify_piece(word_res->seam_array, pain_point.col, pain_point.row, pain_point_type, | |
| 240 word_res->chopped_word, blamer_bundle); | |
| 241 BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row); | |
| 242 if (lst == nullptr) { | |
| 243 ratings->put(pain_point.col, pain_point.row, classified); | |
| 244 } else { | |
| 245 // We cannot delete old BLOB_CHOICEs, since they might contain | |
| 246 // ViterbiStateEntries that are parents of other "active" entries. | |
| 247 // Thus if the matrix cell already contains classifications we add | |
| 248 // the new ones to the beginning of the list. | |
| 249 BLOB_CHOICE_IT it(lst); | |
| 250 it.add_list_before(classified); | |
| 251 delete classified; // safe to delete, since empty after add_list_before() | |
| 252 classified = nullptr; | |
| 253 } | |
| 254 | |
| 255 if (segsearch_debug_level > 0) { | |
| 256 print_ratings_list("Updated ratings matrix with a new entry:", | |
| 257 ratings->get(pain_point.col, pain_point.row), getDict().getUnicharset()); | |
| 258 ratings->print(getDict().getUnicharset()); | |
| 259 } | |
| 260 | |
| 261 // Insert initial "pain points" to join the newly classified blob | |
| 262 // with its left and right neighbors. | |
| 263 if (classified != nullptr && !classified->empty()) { | |
| 264 if (pain_point.col > 0) { | |
| 265 pain_points->GeneratePainPoint(pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0, true, | |
| 266 segsearch_max_char_wh_ratio, word_res); | |
| 267 } | |
| 268 if (pain_point.row + 1 < ratings->dimension()) { | |
| 269 pain_points->GeneratePainPoint(pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0, true, | |
| 270 segsearch_max_char_wh_ratio, word_res); | |
| 271 } | |
| 272 } | |
| 273 (*pending)[pain_point.col].SetBlobClassified(pain_point.row); | |
| 274 } | |
| 275 | |
| 276 // Resets enough of the results so that the Viterbi search is re-run. | |
| 277 // Needed when the n-gram model is enabled, as the multi-length comparison | |
| 278 // implementation will re-value existing paths to worse values. | |
| 279 void Wordrec::ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, | |
| 280 std::vector<SegSearchPending> &pending) { | |
| 281 // TODO(rays) More refactoring required here. | |
| 282 // Delete existing viterbi states. | |
| 283 for (auto &col : best_choice_bundle->beam) { | |
| 284 col->Clear(); | |
| 285 } | |
| 286 // Reset best_choice_bundle. | |
| 287 word_res->ClearWordChoices(); | |
| 288 best_choice_bundle->best_vse = nullptr; | |
| 289 // Clear out all existing pendings and add a new one for the first column. | |
| 290 pending[0].SetColumnClassified(); | |
| 291 for (auto &data : pending) { | |
| 292 data.Clear(); | |
| 293 } | |
| 294 } | |
| 295 | |
| 296 void Wordrec::InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, | |
| 297 BlamerBundle *blamer_bundle, std::string &blamer_debug) { | |
| 298 pain_points->Clear(); // Clear pain points heap. | |
| 299 blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings, getDict().WildcardID(), | |
| 300 wordrec_debug_blamer, blamer_debug, pain_points, | |
| 301 segsearch_max_char_wh_ratio, word_res); | |
| 302 } | |
| 303 | |
| 304 } // namespace tesseract |
