Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/wordrec/segsearch.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/wordrec/segsearch.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,304 @@ +/////////////////////////////////////////////////////////////////////// +// File: segsearch.cpp +// Description: Segmentation search functions. +// Author: Daria Antonova +// +// (C) Copyright 2009, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include <cstdint> // for INT32_MAX +#include "blamer.h" // for BlamerBundle +#include "errcode.h" // for ASSERT_HOST +#include "lm_pain_points.h" // for LMPainPoints, LM_PPTYPE_SHAPE, LMPainPoi... +#include "lm_state.h" // for BestChoiceBundle, ViterbiStateEntry +#include "matrix.h" // for MATRIX_COORD, MATRIX +#include "pageres.h" // for WERD_RES +#include "params.h" // for BoolParam, IntParam, DoubleParam +#include "ratngs.h" // for BLOB_CHOICE_LIST, BLOB_CHOICE_IT +#include "tprintf.h" // for tprintf +#include "wordrec.h" // for Wordrec, SegSearchPending (ptr only) + +namespace tesseract { + +void Wordrec::SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, + BlamerBundle *blamer_bundle) { + LMPainPoints pain_points(segsearch_max_pain_points, segsearch_max_char_wh_ratio, + assume_fixed_pitch_char_segment, &getDict(), segsearch_debug_level); + // Compute scaling factor that will help us recover blob outline length + // from classifier rating and certainty for the blob. + float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale; + std::vector<SegSearchPending> pending; + InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle, blamer_bundle); + + if (!SegSearchDone(0)) { // find a better choice + if (chop_enable && word_res->chopped_word != nullptr) { + improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle, blamer_bundle, + &pain_points, &pending); + } + if (chop_debug) { + SEAM::PrintSeams("Final seam list:", word_res->seam_array); + } + + if (blamer_bundle != nullptr && !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) { + blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer); + } + } + // Keep trying to find a better path by fixing the "pain points". + + MATRIX_COORD pain_point; + float pain_point_priority; + int num_futile_classifications = 0; + std::string blamer_debug; + while (wordrec_enable_assoc && + (!SegSearchDone(num_futile_classifications) || + (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()))) { + // Get the next valid "pain point". + bool found_nothing = true; + LMPainPointsType pp_type; + while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) != LM_PPTYPE_NUM) { + if (!pain_point.Valid(*word_res->ratings)) { + word_res->ratings->IncreaseBandSize(pain_point.row - pain_point.col + 1); + } + if (pain_point.Valid(*word_res->ratings) && + !word_res->ratings->Classified(pain_point.col, pain_point.row, getDict().WildcardID())) { + found_nothing = false; + break; + } + } + if (found_nothing) { + if (segsearch_debug_level > 0) { + tprintf("Pain points queue is empty\n"); + } + break; + } + ProcessSegSearchPainPoint(pain_point_priority, pain_point, + LMPainPoints::PainPointDescription(pp_type), &pending, word_res, + &pain_points, blamer_bundle); + + UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending, word_res, &pain_points, + best_choice_bundle, blamer_bundle); + if (!best_choice_bundle->updated) { + ++num_futile_classifications; + } + + if (segsearch_debug_level > 0) { + tprintf("num_futile_classifications %d\n", num_futile_classifications); + } + + best_choice_bundle->updated = false; // reset updated + + // See if it's time to terminate SegSearch or time for starting a guided + // search for the true path to find the blame for the incorrect best_choice. + if (SegSearchDone(num_futile_classifications) && blamer_bundle != nullptr && + blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) { + InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle, blamer_debug); + } + } // end while loop exploring alternative paths + if (blamer_bundle != nullptr) { + blamer_bundle->FinishSegSearch(word_res->best_choice, wordrec_debug_blamer, blamer_debug); + } + + if (segsearch_debug_level > 0) { + tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n", + language_model_->AcceptableChoiceFound()); + } +} + +// Setup and run just the initial segsearch on an established matrix, +// without doing any additional chopping or joining. +// (Internal factored version that can be used as part of the main SegSearch.) +void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, + std::vector<SegSearchPending> *pending, + BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) { + if (segsearch_debug_level > 0) { + tprintf("Starting SegSearch on ratings matrix%s:\n", + wordrec_enable_assoc ? " (with assoc)" : ""); + word_res->ratings->print(getDict().getUnicharset()); + } + + pain_points->GenerateInitial(word_res); + + // Compute scaling factor that will help us recover blob outline length + // from classifier rating and certainty for the blob. + float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale; + + language_model_->InitForWord(prev_word_best_choice_, assume_fixed_pitch_char_segment, + segsearch_max_char_wh_ratio, rating_cert_scale); + + // Initialize blamer-related information: map character boxes recorded in + // blamer_bundle->norm_truth_word to the corresponding i,j indices in the + // ratings matrix. We expect this step to succeed, since when running the + // chopper we checked that the correct chops are present. + if (blamer_bundle != nullptr) { + blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word, wordrec_debug_blamer); + } + + // pending[col] tells whether there is update work to do to combine + // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *]. + // As the language model state is updated, pending entries are modified to + // minimize duplication of work. It is important that during the update the + // children are considered in the non-decreasing order of their column, since + // this guarantees that all the parents would be up to date before an update + // of a child is done. + pending->clear(); + pending->resize(word_res->ratings->dimension(), SegSearchPending()); + + // Search the ratings matrix for the initial best path. + (*pending)[0].SetColumnClassified(); + UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res, pain_points, best_choice_bundle, + blamer_bundle); +} + +void Wordrec::UpdateSegSearchNodes(float rating_cert_scale, int starting_col, + std::vector<SegSearchPending> *pending, WERD_RES *word_res, + LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, + BlamerBundle *blamer_bundle) { + MATRIX *ratings = word_res->ratings; + ASSERT_HOST(static_cast<unsigned>(ratings->dimension()) == pending->size()); + ASSERT_HOST(static_cast<unsigned>(ratings->dimension()) == best_choice_bundle->beam.size()); + for (int col = starting_col; col < ratings->dimension(); ++col) { + if (!(*pending)[col].WorkToDo()) { + continue; + } + int first_row = col; + int last_row = std::min(ratings->dimension() - 1, col + ratings->bandwidth() - 1); + if ((*pending)[col].SingleRow() >= 0) { + first_row = last_row = (*pending)[col].SingleRow(); + } + if (segsearch_debug_level > 0) { + tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n", col, first_row, + last_row, (*pending)[col].IsRowJustClassified(INT32_MAX)); + } + // Iterate over the pending list for this column. + for (int row = first_row; row <= last_row; ++row) { + // Update language model state of this child+parent pair. + BLOB_CHOICE_LIST *current_node = ratings->get(col, row); + LanguageModelState *parent_node = col == 0 ? nullptr : best_choice_bundle->beam[col - 1]; + if (current_node != nullptr && + language_model_->UpdateState((*pending)[col].IsRowJustClassified(row), col, row, + current_node, parent_node, pain_points, word_res, + best_choice_bundle, blamer_bundle) && + row + 1 < ratings->dimension()) { + // Since the language model state of this entry changed, process all + // the child column. + (*pending)[row + 1].RevisitWholeColumn(); + if (segsearch_debug_level > 0) { + tprintf("Added child col=%d to pending\n", row + 1); + } + } // end if UpdateState. + } // end for row. + } // end for col. + if (best_choice_bundle->best_vse != nullptr) { + ASSERT_HOST(word_res->StatesAllValid()); + if (best_choice_bundle->best_vse->updated) { + pain_points->GenerateFromPath(rating_cert_scale, best_choice_bundle->best_vse, word_res); + if (!best_choice_bundle->fixpt.empty()) { + pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt, best_choice_bundle->best_vse, + word_res); + } + } + } + // The segsearch is completed. Reset all updated flags on all VSEs and reset + // all pendings. + for (unsigned col = 0; col < pending->size(); ++col) { + (*pending)[col].Clear(); + ViterbiStateEntry_IT vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries); + for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) { + vse_it.data()->updated = false; + } + } +} + +void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, + const char *pain_point_type, + std::vector<SegSearchPending> *pending, + WERD_RES *word_res, LMPainPoints *pain_points, + BlamerBundle *blamer_bundle) { + if (segsearch_debug_level > 0) { + tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n", pain_point_type, + pain_point_priority, pain_point.col, pain_point.row); + } + ASSERT_HOST(pain_points != nullptr); + MATRIX *ratings = word_res->ratings; + // Classify blob [pain_point.col pain_point.row] + if (!pain_point.Valid(*ratings)) { + ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col); + } + ASSERT_HOST(pain_point.Valid(*ratings)); + BLOB_CHOICE_LIST *classified = + classify_piece(word_res->seam_array, pain_point.col, pain_point.row, pain_point_type, + word_res->chopped_word, blamer_bundle); + BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row); + if (lst == nullptr) { + ratings->put(pain_point.col, pain_point.row, classified); + } else { + // We cannot delete old BLOB_CHOICEs, since they might contain + // ViterbiStateEntries that are parents of other "active" entries. + // Thus if the matrix cell already contains classifications we add + // the new ones to the beginning of the list. + BLOB_CHOICE_IT it(lst); + it.add_list_before(classified); + delete classified; // safe to delete, since empty after add_list_before() + classified = nullptr; + } + + if (segsearch_debug_level > 0) { + print_ratings_list("Updated ratings matrix with a new entry:", + ratings->get(pain_point.col, pain_point.row), getDict().getUnicharset()); + ratings->print(getDict().getUnicharset()); + } + + // Insert initial "pain points" to join the newly classified blob + // with its left and right neighbors. + if (classified != nullptr && !classified->empty()) { + if (pain_point.col > 0) { + pain_points->GeneratePainPoint(pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0, true, + segsearch_max_char_wh_ratio, word_res); + } + if (pain_point.row + 1 < ratings->dimension()) { + pain_points->GeneratePainPoint(pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0, true, + segsearch_max_char_wh_ratio, word_res); + } + } + (*pending)[pain_point.col].SetBlobClassified(pain_point.row); +} + +// Resets enough of the results so that the Viterbi search is re-run. +// Needed when the n-gram model is enabled, as the multi-length comparison +// implementation will re-value existing paths to worse values. +void Wordrec::ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, + std::vector<SegSearchPending> &pending) { + // TODO(rays) More refactoring required here. + // Delete existing viterbi states. + for (auto &col : best_choice_bundle->beam) { + col->Clear(); + } + // Reset best_choice_bundle. + word_res->ClearWordChoices(); + best_choice_bundle->best_vse = nullptr; + // Clear out all existing pendings and add a new one for the first column. + pending[0].SetColumnClassified(); + for (auto &data : pending) { + data.Clear(); + } +} + +void Wordrec::InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, + BlamerBundle *blamer_bundle, std::string &blamer_debug) { + pain_points->Clear(); // Clear pain points heap. + blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings, getDict().WildcardID(), + wordrec_debug_blamer, blamer_debug, pain_points, + segsearch_max_char_wh_ratio, word_res); +} + +} // namespace tesseract
