diff mupdf-source/thirdparty/tesseract/src/wordrec/chopper.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer includes a version number.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/wordrec/chopper.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,617 @@
+/******************************************************************************
+ *
+ * File: chopper.cpp (Formerly chopper.c)
+ * Author: Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+# include "config_auto.h"
+#endif
+
+#include "blamer.h"         // for BlamerBundle, IRR_CORRECT
+#include "blobs.h"          // for TPOINT, TBLOB, EDGEPT, TESSLINE, divisible_blob
+#include "dict.h"           // for Dict
+#include "lm_pain_points.h" // for LMPainPoints
+#include "lm_state.h"       // for BestChoiceBundle
+#include "matrix.h"         // for MATRIX
+#include "normalis.h"       // for DENORM
+#include "pageres.h"        // for WERD_RES
+#include "params.h"         // for IntParam, BoolParam
+#include "ratngs.h"         // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST (ptr ...
+#include "rect.h"           // for TBOX
+#include "render.h"         // for display_blob
+#include "seam.h"           // for SEAM
+#include "split.h"          // for remove_edgept
+#include "stopper.h"        // for DANGERR
+#include "tprintf.h"        // for tprintf
+#include "wordrec.h"        // for Wordrec, SegSearchPending (ptr only)
+
+namespace tesseract {
+
+// Even though the limit on the number of chunks may now be removed, keep
+// the same limit for repeatable behavior, and it may be a speed advantage.
+static const int kMaxNumChunks = 64;
+
+/*----------------------------------------------------------------------
+            F u n c t i o n s
+----------------------------------------------------------------------*/
+
+/**
+ * @name check_blob
+ *
+ * @return true if blob has a non whole outline.
+ */
+static int check_blob(TBLOB *blob) {
+  TESSLINE *outline;
+  EDGEPT *edgept;
+
+  for (outline = blob->outlines; outline != nullptr; outline = outline->next) {
+    edgept = outline->loop;
+    do {
+      if (edgept == nullptr) {
+        break;
+      }
+      edgept = edgept->next;
+    } while (edgept != outline->loop);
+    if (edgept == nullptr) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/**
+ * @name any_shared_split_points
+ *
+ * Return true if any of the splits share a point with this one.
+ */
+static int any_shared_split_points(const std::vector<SEAM *> &seams, SEAM *seam) {
+  int length;
+  int index;
+
+  length = seams.size();
+  for (index = 0; index < length; index++) {
+    if (seam->SharesPosition(*seams[index])) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * @name preserve_outline_tree
+ *
+ * Copy the list of outlines.
+ */
+static void preserve_outline(EDGEPT *start) {
+  EDGEPT *srcpt;
+
+  if (start == nullptr) {
+    return;
+  }
+  srcpt = start;
+  do {
+    srcpt->runlength = 1;
+    srcpt = srcpt->next;
+  } while (srcpt != start);
+  srcpt->runlength = 2;
+}
+
+static void preserve_outline_tree(TESSLINE *srcline) {
+  TESSLINE *outline;
+
+  for (outline = srcline; outline != nullptr; outline = outline->next) {
+    preserve_outline(outline->loop);
+  }
+}
+
+/**
+ * @name restore_outline_tree
+ *
+ * Copy the list of outlines.
+ */
+static EDGEPT *restore_outline(EDGEPT *start) {
+  EDGEPT *srcpt;
+  EDGEPT *real_start;
+
+  if (start == nullptr) {
+    return nullptr;
+  }
+  srcpt = start;
+  do {
+    if (srcpt->runlength == 2) {
+      break;
+    }
+    srcpt = srcpt->next;
+  } while (srcpt != start);
+  real_start = srcpt;
+  do {
+    srcpt = srcpt->next;
+    if (srcpt->prev->runlength == 0) {
+      remove_edgept(srcpt->prev);
+    }
+  } while (srcpt != real_start);
+  return real_start;
+}
+
+static void restore_outline_tree(TESSLINE *srcline) {
+  TESSLINE *outline;
+
+  for (outline = srcline; outline != nullptr; outline = outline->next) {
+    outline->loop = restore_outline(outline->loop);
+    outline->start = outline->loop->pos;
+  }
+}
+
+/**********************************************************************
+ * total_containment
+ *
+ * Check to see if one of these outlines is totally contained within
+ * the bounding box of the other.
+ **********************************************************************/
+static int16_t total_containment(TBLOB *blob1, TBLOB *blob2) {
+  TBOX box1 = blob1->bounding_box();
+  TBOX box2 = blob2->bounding_box();
+  return box1.contains(box2) || box2.contains(box1);
+}
+
+// Helper runs all the checks on a seam to make sure it is valid.
+// Returns the seam if OK, otherwise deletes the seam and returns nullptr.
+static SEAM *CheckSeam(int debug_level, int32_t blob_number, TWERD *word, TBLOB *blob,
+                       TBLOB *other_blob, const std::vector<SEAM *> &seams, SEAM *seam) {
+  if (seam == nullptr || blob->outlines == nullptr || other_blob->outlines == nullptr ||
+      total_containment(blob, other_blob) || check_blob(other_blob) ||
+      !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||
+      any_shared_split_points(seams, seam) ||
+      !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {
+    word->blobs.erase(word->blobs.begin() + blob_number + 1);
+    if (seam) {
+      seam->UndoSeam(blob, other_blob);
+      delete seam;
+      seam = nullptr;
+#ifndef GRAPHICS_DISABLED
+      if (debug_level) {
+        if (debug_level > 2) {
+          display_blob(blob, ScrollView::RED);
+        }
+        tprintf("\n** seam being removed ** \n");
+      }
+#endif
+    } else {
+      delete other_blob;
+    }
+    return nullptr;
+  }
+  return seam;
+}
+
+/**
+ * @name attempt_blob_chop
+ *
+ * Try to split the this blob after this one. Check to make sure that
+ * it was successful.
+ */
+SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
+                                 const std::vector<SEAM *> &seams) {
+  if (repair_unchopped_blobs) {
+    preserve_outline_tree(blob->outlines);
+  }
+  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
+  // Insert it into the word.
+  word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);
+
+  SEAM *seam = nullptr;
+  if (prioritize_division) {
+    TPOINT location;
+    if (divisible_blob(blob, italic_blob, &location)) {
+      seam = new SEAM(0.0f, location);
+    }
+  }
+  if (seam == nullptr) {
+    seam = pick_good_seam(blob);
+  }
+  if (chop_debug) {
+    if (seam != nullptr) {
+      seam->Print("Good seam picked=");
+    } else {
+      tprintf("\n** no seam picked *** \n");
+    }
+  }
+  if (seam) {
+    seam->ApplySeam(italic_blob, blob, other_blob);
+  }
+
+  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);
+  if (seam == nullptr) {
+    if (repair_unchopped_blobs) {
+      restore_outline_tree(blob->outlines);
+    }
+    if (allow_blob_division && !prioritize_division) {
+      // If the blob can simply be divided into outlines, then do that.
+      TPOINT location;
+      if (divisible_blob(blob, italic_blob, &location)) {
+        other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
+        word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);
+        seam = new SEAM(0.0f, location);
+        seam->ApplySeam(italic_blob, blob, other_blob);
+        seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);
+      }
+    }
+  }
+  if (seam != nullptr) {
+    // Make sure this seam doesn't get chopped again.
+    seam->Finalize();
+  }
+  return seam;
+}
+
+SEAM *Wordrec::chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
+                                  const std::vector<SEAM *> &seams) {
+  return attempt_blob_chop(word, word->blobs[blob_number], blob_number, italic_blob, seams);
+}
+
+SEAM *Wordrec::chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob,
+                                     WERD_RES *word_res, unsigned *blob_number) {
+  TWERD *word = word_res->chopped_word;
+  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
+    TBLOB *blob = word->blobs[*blob_number];
+    TPOINT topleft, botright;
+    topleft.x = blob->bounding_box().left();
+    topleft.y = blob->bounding_box().top();
+    botright.x = blob->bounding_box().right();
+    botright.y = blob->bounding_box().bottom();
+
+    TPOINT original_topleft, original_botright;
+    word_res->denorm.DenormTransform(nullptr, topleft, &original_topleft);
+    word_res->denorm.DenormTransform(nullptr, botright, &original_botright);
+
+    TBOX original_box =
+        TBOX(original_topleft.x, original_botright.y, original_botright.x, original_topleft.y);
+
+    bool almost_equal_box = false;
+    int num_overlap = 0;
+    for (auto &&boxe : boxes) {
+      if (original_box.overlap_fraction(boxe) > 0.125) {
+        num_overlap++;
+      }
+      if (original_box.almost_equal(boxe, 3)) {
+        almost_equal_box = true;
+      }
+    }
+
+    TPOINT location;
+    if (divisible_blob(blob, italic_blob, &location) || (!almost_equal_box && num_overlap > 1)) {
+      SEAM *seam = attempt_blob_chop(word, blob, *blob_number, italic_blob, word_res->seam_array);
+      if (seam != nullptr) {
+        return seam;
+      }
+    }
+  }
+
+  *blob_number = UINT_MAX;
+  return nullptr;
+}
+
+/**
+ * @name improve_one_blob
+ *
+ * Finds the best place to chop, based on the worst blob, fixpt, or next to
+ * a fragment, according to the input. Returns the SEAM corresponding to the
+ * chop point, if any is found, and the index in the ratings_matrix of the
+ * chopped blob. Note that blob_choices is just a copy of the pointers in the
+ * leading diagonal of the ratings MATRIX.
+ * Although the blob is chopped, the returned SEAM is yet to be inserted into
+ * word->seam_array and the resulting blobs are unclassified, so this function
+ * can be used by ApplyBox as well as during recognition.
+ */
+SEAM *Wordrec::improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
+                                bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
+                                unsigned *blob_number) {
+  float rating_ceiling = FLT_MAX;
+  SEAM *seam = nullptr;
+  do {
+    auto blob = select_blob_to_split_from_fixpt(fixpt);
+    if (chop_debug) {
+      tprintf("blob_number from fixpt = %d\n", blob);
+    }
+    bool split_point_from_dict = (blob != -1);
+    if (split_point_from_dict) {
+      fixpt->clear();
+    } else {
+      blob = select_blob_to_split(blob_choices, rating_ceiling, split_next_to_fragment);
+    }
+    if (chop_debug) {
+      tprintf("blob_number = %d\n", blob);
+    }
+    *blob_number = blob;
+    if (blob == -1) {
+      return nullptr;
+    }
+
+    // TODO(rays) it may eventually help to allow italic_blob to be true,
+    seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob, word->seam_array);
+    if (seam != nullptr) {
+      break; // Success!
+    }
+    if (blob_choices[*blob_number] == nullptr) {
+      return nullptr;
+    }
+    if (!split_point_from_dict) {
+      // We chopped the worst rated blob, try something else next time.
+      rating_ceiling = blob_choices[*blob_number]->rating();
+    }
+  } while (true);
+  return seam;
+}
+
+/**
+ * @name chop_one_blob
+ *
+ * Start with the current one-blob word and its classification. Find
+ * the worst blobs and try to divide it up to improve the ratings.
+ * Used for testing chopper.
+ */
+SEAM *Wordrec::chop_one_blob(const std::vector<TBOX> &boxes,
+                             const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
+                             unsigned *blob_number) {
+  if (prioritize_division) {
+    return chop_overlapping_blob(boxes, true, word_res, blob_number);
+  } else {
+    return improve_one_blob(blob_choices, nullptr, false, true, word_res, blob_number);
+  }
+}
+
+/**
+ * @name chop_word_main
+ *
+ * Classify the blobs in this word and permute the results. Find the
+ * worst blob in the word and chop it up. Continue this process until
+ * a good answer has been found or all the blobs have been chopped up
+ * enough. The results are returned in the WERD_RES.
+ */
+void Wordrec::chop_word_main(WERD_RES *word) {
+  int num_blobs = word->chopped_word->NumBlobs();
+  if (word->ratings == nullptr) {
+    word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
+  }
+  if (word->ratings->get(0, 0) == nullptr) {
+    // Run initial classification.
+    for (int b = 0; b < num_blobs; ++b) {
+      BLOB_CHOICE_LIST *choices = classify_piece(
+          word->seam_array, b, b, "Initial:", word->chopped_word, word->blamer_bundle);
+      word->ratings->put(b, b, choices);
+    }
+  } else {
+    // Blobs have been pre-classified. Set matrix cell for all blob choices
+    for (int col = 0; col < word->ratings->dimension(); ++col) {
+      for (int row = col;
+           row < word->ratings->dimension() && row < col + word->ratings->bandwidth(); ++row) {
+        BLOB_CHOICE_LIST *choices = word->ratings->get(col, row);
+        if (choices != nullptr) {
+          BLOB_CHOICE_IT bc_it(choices);
+          for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
+            bc_it.data()->set_matrix_cell(col, row);
+          }
+        }
+      }
+    }
+  }
+
+  // Run Segmentation Search.
+  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
+  SegSearch(word, &best_choice_bundle, word->blamer_bundle);
+
+  if (word->best_choice == nullptr) {
+    // SegSearch found no valid paths, so just use the leading diagonal.
+    word->FakeWordFromRatings(TOP_CHOICE_PERM);
+  }
+  word->RebuildBestState();
+  // If we finished without a hyphen at the end of the word, let the next word
+  // be found in the dictionary.
+  if (word->word->flag(W_EOL) && !getDict().has_hyphen_end(*word->best_choice)) {
+    getDict().reset_hyphen_vars(true);
+  }
+
+  if (word->blamer_bundle != nullptr && this->fill_lattice_ != nullptr) {
+    CallFillLattice(*word->ratings, word->best_choices, *word->uch_set, word->blamer_bundle);
+  }
+  if (wordrec_debug_level > 0) {
+    tprintf("Final Ratings Matrix:\n");
+    word->ratings->print(getDict().getUnicharset());
+  }
+  word->FilterWordChoices(getDict().stopper_debug_level);
+}
+
+/**
+ * @name improve_by_chopping
+ *
+ * Repeatedly chops the worst blob, classifying the new blobs fixing up all
+ * the data, and incrementally runs the segmentation search until a good word
+ * is found, or no more chops can be found.
+ */
+void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
+                                  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
+                                  LMPainPoints *pain_points,
+                                  std::vector<SegSearchPending> *pending) {
+  unsigned blob_number;
+  do { // improvement loop.
+    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
+    // one to chop.
+    std::vector<BLOB_CHOICE *> blob_choices;
+    int num_blobs = word->ratings->dimension();
+    for (int i = 0; i < num_blobs; ++i) {
+      BLOB_CHOICE_LIST *choices = word->ratings->get(i, i);
+      if (choices == nullptr || choices->empty()) {
+        blob_choices.push_back(nullptr);
+      } else {
+        BLOB_CHOICE_IT bc_it(choices);
+        blob_choices.push_back(bc_it.data());
+      }
+    }
+    SEAM *seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt, false, false, word,
                                  &blob_number);
+    if (seam == nullptr) {
+      break;
+    }
+    // A chop has been made. We have to correct all the data structures to
+    // take into account the extra bottom-level blob.
+    // Put the seam into the seam_array and correct everything else on the
+    // word: ratings matrix (including matrix location in the BLOB_CHOICES),
+    // states in WERD_CHOICEs, and blob widths.
+    word->InsertSeam(blob_number, seam);
+    // Insert a new entry in the beam array.
+    best_choice_bundle->beam.insert(best_choice_bundle->beam.begin() + blob_number, new LanguageModelState);
+    // Fixpts are outdated, but will get recalculated.
+    best_choice_bundle->fixpt.clear();
+    // Remap existing pain points.
+    pain_points->RemapForSplit(blob_number);
+    // Insert a new pending at the chop point.
+    pending->insert(pending->begin() + blob_number, SegSearchPending());
+
+    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
+    // as that updates the pending correctly and adds new pain points.
+    MATRIX_COORD pain_point(blob_number, blob_number);
+    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word, pain_points, blamer_bundle);
+    pain_point.col = blob_number + 1;
+    pain_point.row = blob_number + 1;
+    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word, pain_points, blamer_bundle);
+    if (language_model_->language_model_ngram_on) {
+      // N-gram evaluation depends on the number of blobs in a chunk, so we
+      // have to re-evaluate everything in the word.
+      ResetNGramSearch(word, best_choice_bundle, *pending);
+      blob_number = 0;
+    }
+    // Run language model incrementally. (Except with the n-gram model on.)
+    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending, word, pain_points,
+                         best_choice_bundle, blamer_bundle);
+  } while (!language_model_->AcceptableChoiceFound() && word->ratings->dimension() < kMaxNumChunks);
+
+  // If after running only the chopper best_choice is incorrect and no blame
+  // has been yet set, blame the classifier if best_choice is classifier's
+  // top choice and is a dictionary word (i.e. language model could not have
+  // helped). Otherwise blame the tradeoff between the classifier and
+  // the old language model (permuters).
+  if (word->blamer_bundle != nullptr &&
+      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
+      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
+    bool valid_permuter = word->best_choice != nullptr &&
+                          Dict::valid_word_permuter(word->best_choice->permuter(), false);
+    word->blamer_bundle->BlameClassifierOrLangModel(word, getDict().getUnicharset(), valid_permuter,
+                                                    wordrec_debug_blamer);
+  }
+}
+
+/**********************************************************************
+ * select_blob_to_split
+ *
+ * These are the results of the last classification. Find a likely
+ * place to apply splits. If none, return -1.
+ **********************************************************************/
+int Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices,
+                                  float rating_ceiling, bool split_next_to_fragment) {
+  BLOB_CHOICE *blob_choice;
+  float worst = -FLT_MAX;
+  int worst_index = -1;
+  float worst_near_fragment = -FLT_MAX;
+  int worst_index_near_fragment = -1;
+  std::vector<const CHAR_FRAGMENT *> fragments;
+
+  if (chop_debug) {
+    if (rating_ceiling < FLT_MAX) {
+      tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
+    } else {
+      tprintf("rating_ceiling = No Limit\n");
+    }
+  }
+
+  if (split_next_to_fragment && blob_choices.size() > 0) {
+    fragments.resize(blob_choices.size());
+    if (blob_choices[0] != nullptr) {
+      fragments[0] = getDict().getUnicharset().get_fragment(blob_choices[0]->unichar_id());
+    } else {
+      fragments[0] = nullptr;
+    }
+  }
+
+  for (unsigned x = 0; x < blob_choices.size(); ++x) {
+    if (blob_choices[x] == nullptr) {
+      return x;
+    } else {
+      blob_choice = blob_choices[x];
+      // Populate fragments for the following position.
+      if (split_next_to_fragment && x + 1 < blob_choices.size()) {
+        if (blob_choices[x + 1] != nullptr) {
+          fragments[x + 1] =
+              getDict().getUnicharset().get_fragment(blob_choices[x + 1]->unichar_id());
+        } else {
+          fragments[x + 1] = nullptr;
+        }
+      }
+      if (blob_choice->rating() < rating_ceiling &&
+          blob_choice->certainty() < tessedit_certainty_threshold) {
+        // Update worst and worst_index.
+        if (blob_choice->rating() > worst) {
+          worst_index = x;
+          worst = blob_choice->rating();
+        }
+        if (split_next_to_fragment) {
+          // Update worst_near_fragment and worst_index_near_fragment.
+          bool expand_following_fragment =
+              (x + 1 < blob_choices.size() && fragments[x + 1] != nullptr &&
+               !fragments[x + 1]->is_beginning());
+          bool expand_preceding_fragment =
+              (x > 0 && fragments[x - 1] != nullptr && !fragments[x - 1]->is_ending());
+          if ((expand_following_fragment || expand_preceding_fragment) &&
+              blob_choice->rating() > worst_near_fragment) {
+            worst_index_near_fragment = x;
+            worst_near_fragment = blob_choice->rating();
+            if (chop_debug) {
+              tprintf(
+                  "worst_index_near_fragment=%d"
+                  " expand_following_fragment=%d"
+                  " expand_preceding_fragment=%d\n",
+                  worst_index_near_fragment, expand_following_fragment, expand_preceding_fragment);
+            }
+          }
+        }
+      }
+    }
+  }
+  // TODO(daria): maybe a threshold of badness for
+  // worst_near_fragment would be useful.
+  return worst_index_near_fragment != -1 ? worst_index_near_fragment : worst_index;
+}
+
+/**********************************************************************
+ * select_blob_to_split_from_fixpt
+ *
+ * Given the fix point from a dictionary search, if there is a single
+ * dangerous blob that maps to multiple characters, return that blob
+ * index as a place we need to split. If none, return -1.
+ **********************************************************************/
+int Wordrec::select_blob_to_split_from_fixpt(DANGERR *fixpt) {
+  if (!fixpt) {
+    return -1;
+  }
+  for (auto &i : *fixpt) {
+    if (i.begin + 1 == i.end && i.dangerous && i.correct_is_ngram) {
+      return i.begin;
+    }
+  }
+  return -1;
+}
+
+} // namespace tesseract
