Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccstruct/werd.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/werd.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,584 @@ +/********************************************************************** + * File: werd.cpp (Formerly word.c) + * Description: Code for the WERD class. + * Author: Ray Smith + * + * (C) Copyright 1991, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "werd.h" + +#include "linlsq.h" + +#include "helpers.h" + +namespace tesseract { + +#define FIRST_COLOUR ScrollView::RED ///< first rainbow colour +#define LAST_COLOUR ScrollView::AQUAMARINE ///< last rainbow colour +#define CHILD_COLOUR ScrollView::BROWN ///< colour of children + +/** + * WERD::WERD + * + * Constructor to build a WERD from a list of C_BLOBs. + * blob_list The C_BLOBs (in word order) are not copied; + * we take its elements and put them in our lists. + * blank_count blanks in front of the word + * text correct text, outlives this WERD + */ +WERD::WERD(C_BLOB_LIST *blob_list, uint8_t blank_count, const char *text) + : blanks(blank_count), flags(0), script_id_(0), correct(text ? text : "") { + C_BLOB_IT start_it = &cblobs; + C_BLOB_IT rej_cblob_it = &rej_cblobs; + C_OUTLINE_IT c_outline_it; + int16_t inverted_vote = 0; + int16_t non_inverted_vote = 0; + + // Move blob_list's elements into cblobs. + start_it.add_list_after(blob_list); + + /* + Set white on black flag for the WERD, moving any duff blobs onto the + rej_cblobs list. + First, walk the cblobs checking the inverse flag for each outline of each + cblob. If a cblob has inconsistent flag settings for its different + outlines, move the blob to the reject list. Otherwise, increment the + appropriate w-on-b or b-on-w vote for the word. + + Now set the inversion flag for the WERD by maximum vote. + + Walk the blobs again, moving any blob whose inversion flag does not agree + with the concencus onto the reject list. +*/ + start_it.set_to_list(&cblobs); + if (start_it.empty()) { + return; + } + for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { + bool reject_blob = false; + bool blob_inverted; + + c_outline_it.set_to_list(start_it.data()->out_list()); + blob_inverted = c_outline_it.data()->flag(COUT_INVERSE); + for (c_outline_it.mark_cycle_pt(); !c_outline_it.cycled_list() && !reject_blob; + c_outline_it.forward()) { + reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted; + } + if (reject_blob) { + rej_cblob_it.add_after_then_move(start_it.extract()); + } else { + if (blob_inverted) { + inverted_vote++; + } else { + non_inverted_vote++; + } + } + } + + flags.set(W_INVERSE, (inverted_vote > non_inverted_vote)); + + start_it.set_to_list(&cblobs); + if (start_it.empty()) { + return; + } + for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { + c_outline_it.set_to_list(start_it.data()->out_list()); + if (c_outline_it.data()->flag(COUT_INVERSE) != flags[W_INVERSE]) { + rej_cblob_it.add_after_then_move(start_it.extract()); + } + } +} + +/** + * WERD::WERD + * + * Constructor to build a WERD from a list of C_BLOBs. + * The C_BLOBs are not copied so the source list is emptied. + */ + +WERD::WERD(C_BLOB_LIST *blob_list, ///< In word order + WERD *clone) ///< Source of flags + : flags(clone->flags), script_id_(clone->script_id_), correct(clone->correct) { + C_BLOB_IT start_it = blob_list; // iterator + C_BLOB_IT end_it = blob_list; // another + + while (!end_it.at_last()) { + end_it.forward(); // move to last + } + cblobs.assign_to_sublist(&start_it, &end_it); + // move to our list + blanks = clone->blanks; + // fprintf(stderr,"Wrong constructor!!!!\n"); +} + +// Construct a WERD from a single_blob and clone the flags from this. +// W_BOL and W_EOL flags are set according to the given values. +WERD *WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob) { + C_BLOB_LIST temp_blobs; + C_BLOB_IT temp_it(&temp_blobs); + temp_it.add_after_then_move(blob); + WERD *blob_word = new WERD(&temp_blobs, this); + blob_word->set_flag(W_BOL, bol); + blob_word->set_flag(W_EOL, eol); + return blob_word; +} + +/** + * WERD::bounding_box + * + * Return the bounding box of the WERD. + * This is quite a mess to compute! + * ORIGINALLY, REJECT CBLOBS WERE EXCLUDED, however, this led to bugs when the + * words on the row were re-sorted. The original words were built with reject + * blobs included. The FUZZY SPACE flags were set accordingly. If ALL the + * blobs in a word are rejected the BB for the word is nullptr, causing the sort + * to screw up, leading to the erroneous possibility of the first word in a + * row being marked as FUZZY space. + */ + +TBOX WERD::bounding_box() const { + return restricted_bounding_box(true, true); +} + +// Returns the bounding box including the desired combination of upper and +// lower noise/diacritic elements. +TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const { + TBOX box = true_bounding_box(); + int bottom = box.bottom(); + int top = box.top(); + // This is a read-only iteration of the rejected blobs. + C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&rej_cblobs)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + TBOX dot_box = it.data()->bounding_box(); + if ((upper_dots || dot_box.bottom() <= top) && (lower_dots || dot_box.top() >= bottom)) { + box += dot_box; + } + } + return box; +} + +// Returns the bounding box of only the good blobs. +TBOX WERD::true_bounding_box() const { + TBOX box; // box being built + // This is a read-only iteration of the good blobs. + C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&cblobs)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + box += it.data()->bounding_box(); + } + return box; +} + +/** + * WERD::move + * + * Reposition WERD by vector + * NOTE!! REJECT CBLOBS ARE NOT MOVED + */ + +void WERD::move(const ICOORD vec) { + C_BLOB_IT cblob_it(&cblobs); // cblob iterator + + for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) { + cblob_it.data()->move(vec); + } +} + +/** + * WERD::join_on + * + * Join other word onto this one. Delete the old word. + */ + +void WERD::join_on(WERD *other) { + C_BLOB_IT blob_it(&cblobs); + C_BLOB_IT src_it(&other->cblobs); + C_BLOB_IT rej_cblob_it(&rej_cblobs); + C_BLOB_IT src_rej_it(&other->rej_cblobs); + + while (!src_it.empty()) { + blob_it.add_to_end(src_it.extract()); + src_it.forward(); + } + while (!src_rej_it.empty()) { + rej_cblob_it.add_to_end(src_rej_it.extract()); + src_rej_it.forward(); + } +} + +/** + * WERD::copy_on + * + * Copy blobs from other word onto this one. + */ + +void WERD::copy_on(WERD *other) { + bool reversed = other->bounding_box().left() < bounding_box().left(); + C_BLOB_IT c_blob_it(&cblobs); + C_BLOB_LIST c_blobs; + + c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy); + if (reversed) { + c_blob_it.add_list_before(&c_blobs); + } else { + c_blob_it.move_to_last(); + c_blob_it.add_list_after(&c_blobs); + } + if (!other->rej_cblobs.empty()) { + C_BLOB_IT rej_c_blob_it(&rej_cblobs); + C_BLOB_LIST new_rej_c_blobs; + + new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy); + if (reversed) { + rej_c_blob_it.add_list_before(&new_rej_c_blobs); + } else { + rej_c_blob_it.move_to_last(); + rej_c_blob_it.add_list_after(&new_rej_c_blobs); + } + } +} + +/** + * WERD::print + * + * Display members + */ + +void WERD::print() const { + tprintf("Blanks= %d\n", blanks); + bounding_box().print(); + tprintf("Flags = %lu = 0%lo\n", flags.to_ulong(), flags.to_ulong()); + tprintf(" W_SEGMENTED = %s\n", flags[W_SEGMENTED] ? "TRUE" : "FALSE"); + tprintf(" W_ITALIC = %s\n", flags[W_ITALIC] ? "TRUE" : "FALSE"); + tprintf(" W_BOL = %s\n", flags[W_BOL] ? "TRUE" : "FALSE"); + tprintf(" W_EOL = %s\n", flags[W_EOL] ? "TRUE" : "FALSE"); + tprintf(" W_NORMALIZED = %s\n", flags[W_NORMALIZED] ? "TRUE" : "FALSE"); + tprintf(" W_SCRIPT_HAS_XHEIGHT = %s\n", flags[W_SCRIPT_HAS_XHEIGHT] ? "TRUE" : "FALSE"); + tprintf(" W_SCRIPT_IS_LATIN = %s\n", flags[W_SCRIPT_IS_LATIN] ? "TRUE" : "FALSE"); + tprintf(" W_DONT_CHOP = %s\n", flags[W_DONT_CHOP] ? "TRUE" : "FALSE"); + tprintf(" W_REP_CHAR = %s\n", flags[W_REP_CHAR] ? "TRUE" : "FALSE"); + tprintf(" W_FUZZY_SP = %s\n", flags[W_FUZZY_SP] ? "TRUE" : "FALSE"); + tprintf(" W_FUZZY_NON = %s\n", flags[W_FUZZY_NON] ? "TRUE" : "FALSE"); + tprintf("Correct= %s\n", correct.c_str()); + tprintf("Rejected cblob count = %d\n", rej_cblobs.length()); + tprintf("Script = %d\n", script_id_); +} + +/** + * WERD::plot + * + * Draw the WERD in the given colour. + */ + +#ifndef GRAPHICS_DISABLED +void WERD::plot(ScrollView *window, ScrollView::Color colour) { + C_BLOB_IT it = &cblobs; + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + it.data()->plot(window, colour, colour); + } + plot_rej_blobs(window); +} + +// Get the next color in the (looping) rainbow. +ScrollView::Color WERD::NextColor(ScrollView::Color colour) { + auto next = static_cast<ScrollView::Color>(colour + 1); + if (next >= LAST_COLOUR || next < FIRST_COLOUR) { + next = FIRST_COLOUR; + } + return next; +} + +/** + * WERD::plot + * + * Draw the WERD in rainbow colours in window. + */ + +void WERD::plot(ScrollView *window) { + ScrollView::Color colour = FIRST_COLOUR; + C_BLOB_IT it = &cblobs; + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + it.data()->plot(window, colour, CHILD_COLOUR); + colour = NextColor(colour); + } + plot_rej_blobs(window); +} + +/** + * WERD::plot_rej_blobs + * + * Draw the WERD rejected blobs in window - ALWAYS GREY + */ + +void WERD::plot_rej_blobs(ScrollView *window) { + C_BLOB_IT it = &rej_cblobs; + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + it.data()->plot(window, ScrollView::GREY, ScrollView::GREY); + } +} +#endif // !GRAPHICS_DISABLED + +/** + * WERD::shallow_copy() + * + * Make a shallow copy of a word + */ + +WERD *WERD::shallow_copy() { + WERD *new_word = new WERD; + + new_word->blanks = blanks; + new_word->flags = flags; + new_word->correct = correct; + return new_word; +} + +/** + * WERD::operator= + * + * Assign a word, DEEP copying the blob list + */ + +WERD &WERD::operator=(const WERD &source) { + this->ELIST2_LINK::operator=(source); + blanks = source.blanks; + flags = source.flags; + script_id_ = source.script_id_; + correct = source.correct; + cblobs.clear(); + cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy); + rej_cblobs.clear(); + rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy); + return *this; +} + +/** + * word_comparator() + * + * word comparator used to sort a word list so that words are in increasing + * order of left edge. + */ + +int word_comparator(const void *word1p, const void *word2p) { + const WERD *word1 = *reinterpret_cast<const WERD *const *>(word1p); + const WERD *word2 = *reinterpret_cast<const WERD *const *>(word2p); + return word1->bounding_box().left() - word2->bounding_box().left(); +} + +/** + * WERD::ConstructWerdWithNewBlobs() + * + * This method returns a new werd constructed using the blobs in the input + * all_blobs list, which correspond to the blobs in this werd object. The + * blobs used to construct the new word are consumed and removed from the + * input all_blobs list. + * Returns nullptr if the word couldn't be constructed. + * Returns original blobs for which no matches were found in the output list + * orphan_blobs (appends). + */ + +WERD *WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs) { + C_BLOB_LIST current_blob_list; + C_BLOB_IT werd_blobs_it(¤t_blob_list); + // Add the word's c_blobs. + werd_blobs_it.add_list_after(cblob_list()); + + // New blob list. These contain the blobs which will form the new word. + C_BLOB_LIST new_werd_blobs; + C_BLOB_IT new_blobs_it(&new_werd_blobs); + + // not_found_blobs contains the list of current word's blobs for which a + // corresponding blob wasn't found in the input all_blobs list. + C_BLOB_LIST not_found_blobs; + C_BLOB_IT not_found_it(¬_found_blobs); + not_found_it.move_to_last(); + + werd_blobs_it.move_to_first(); + for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list(); werd_blobs_it.forward()) { + C_BLOB *werd_blob = werd_blobs_it.extract(); + TBOX werd_blob_box = werd_blob->bounding_box(); + bool found = false; + // Now find the corresponding blob for this blob in the all_blobs + // list. For now, follow the inefficient method of pairwise + // comparisons. Ideally, one can pre-bucket the blobs by row. + C_BLOB_IT all_blobs_it(all_blobs); + for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) { + C_BLOB *a_blob = all_blobs_it.data(); + // Compute the overlap of the two blobs. If major, a_blob should + // be added to the new blobs list. + TBOX a_blob_box = a_blob->bounding_box(); + if (a_blob_box.null_box()) { + tprintf("Bounding box couldn't be ascertained\n"); + } + if (werd_blob_box.contains(a_blob_box) || werd_blob_box.major_overlap(a_blob_box)) { + // Old blobs are from minimal splits, therefore are expected to be + // bigger. The new small blobs should cover a significant portion. + // This is it. + all_blobs_it.extract(); + new_blobs_it.add_after_then_move(a_blob); + found = true; + } + } + if (!found) { + not_found_it.add_after_then_move(werd_blob); + } else { + delete werd_blob; + } + } + // Iterate over all not found blobs. Some of them may be due to + // under-segmentation (which is OK, since the corresponding blob is already + // in the list in that case. + not_found_it.move_to_first(); + for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) { + C_BLOB *not_found = not_found_it.data(); + TBOX not_found_box = not_found->bounding_box(); + C_BLOB_IT existing_blobs_it(new_blobs_it); + for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list(); + existing_blobs_it.forward()) { + C_BLOB *a_blob = existing_blobs_it.data(); + TBOX a_blob_box = a_blob->bounding_box(); + if ((not_found_box.major_overlap(a_blob_box) || a_blob_box.major_overlap(not_found_box)) && + not_found_box.y_overlap_fraction(a_blob_box) > 0.8) { + // Already taken care of. + delete not_found_it.extract(); + break; + } + } + } + if (orphan_blobs) { + C_BLOB_IT orphan_blobs_it(orphan_blobs); + orphan_blobs_it.move_to_last(); + orphan_blobs_it.add_list_after(¬_found_blobs); + } + + // New blobs are ready. Create a new werd object with these. + WERD *new_werd = nullptr; + if (!new_werd_blobs.empty()) { + new_werd = new WERD(&new_werd_blobs, this); + } else { + // Add the blobs back to this word so that it can be reused. + C_BLOB_IT this_list_it(cblob_list()); + this_list_it.add_list_after(¬_found_blobs); + } + return new_werd; +} + +// Removes noise from the word by moving small outlines to the rej_cblobs +// list, based on the size_threshold. +void WERD::CleanNoise(float size_threshold) { + C_BLOB_IT blob_it(&cblobs); + C_BLOB_IT rej_it(&rej_cblobs); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + C_BLOB *blob = blob_it.data(); + C_OUTLINE_IT ol_it(blob->out_list()); + for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { + C_OUTLINE *outline = ol_it.data(); + TBOX ol_box = outline->bounding_box(); + int ol_size = ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height(); + if (ol_size < size_threshold) { + // This outline is too small. Move it to a separate blob in the + // reject blobs list. + auto *rej_blob = new C_BLOB(ol_it.extract()); + rej_it.add_after_then_move(rej_blob); + } + } + if (blob->out_list()->empty()) { + delete blob_it.extract(); + } + } +} + +// Extracts all the noise outlines and stuffs the pointers into the given +// vector of outlines. Afterwards, the outlines vector owns the pointers. +void WERD::GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines) { + C_BLOB_IT rej_it(&rej_cblobs); + for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) { + C_BLOB *blob = rej_it.extract(); + C_OUTLINE_IT ol_it(blob->out_list()); + outlines->push_back(ol_it.extract()); + delete blob; + } +} + +// Adds the selected outlines to the indcated real blobs, and puts the rest +// back in rej_cblobs where they came from. Where the target_blobs entry is +// nullptr, a run of wanted outlines is put into a single new blob. +// Ownership of the outlines is transferred back to the word. (Hence +// vector and not PointerVector.) +// Returns true if any new blob was added to the start of the word, which +// suggests that it might need joining to the word before it, and likewise +// sets make_next_word_fuzzy true if any new blob was added to the end. +bool WERD::AddSelectedOutlines(const std::vector<bool> &wanted, + const std::vector<C_BLOB *> &target_blobs, + const std::vector<C_OUTLINE *> &outlines, + bool *make_next_word_fuzzy) { + bool outline_added_to_start = false; + if (make_next_word_fuzzy != nullptr) { + *make_next_word_fuzzy = false; + } + C_BLOB_IT rej_it(&rej_cblobs); + for (unsigned i = 0; i < outlines.size(); ++i) { + C_OUTLINE *outline = outlines[i]; + if (outline == nullptr) { + continue; // Already used it. + } + if (wanted[i]) { + C_BLOB *target_blob = target_blobs[i]; + TBOX noise_box = outline->bounding_box(); + if (target_blob == nullptr) { + target_blob = new C_BLOB(outline); + // Need to find the insertion point. + C_BLOB_IT blob_it(&cblobs); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + C_BLOB *blob = blob_it.data(); + TBOX blob_box = blob->bounding_box(); + if (blob_box.left() > noise_box.left()) { + if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) { + // We might want to join this word to its predecessor. + outline_added_to_start = true; + } + blob_it.add_before_stay_put(target_blob); + break; + } + } + if (blob_it.cycled_list()) { + blob_it.add_to_end(target_blob); + if (make_next_word_fuzzy != nullptr) { + *make_next_word_fuzzy = true; + } + } + // Add all consecutive wanted, but null-blob outlines to same blob. + C_OUTLINE_IT ol_it(target_blob->out_list()); + while (i + 1 < outlines.size() && wanted[i + 1] && target_blobs[i + 1] == nullptr) { + ++i; + ol_it.add_to_end(outlines[i]); + } + } else { + // Insert outline into this blob. + C_OUTLINE_IT ol_it(target_blob->out_list()); + ol_it.add_to_end(outline); + } + } else { + // Put back on noise list. + rej_it.add_to_end(new C_BLOB(outline)); + } + } + return outline_added_to_start; +} + +} // namespace tesseract
