Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/ccstruct/werd.cpp comparison

comparison mupdf-source/thirdparty/tesseract/src/ccstruct/werd.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+/**********************************************************************
+* File:        werd.cpp  (Formerly word.c)
+* Description: Code for the WERD class.
+* Author:      Ray Smith
+*
+* (C) Copyright 1991, Hewlett-Packard Ltd.
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+** http://www.apache.org/licenses/LICENSE-2.0
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*
+**********************************************************************/
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+#include "werd.h"
+#include "linlsq.h"
+#include "helpers.h"
+namespace tesseract {
+#define FIRST_COLOUR ScrollView::RED       ///< first rainbow colour
+#define LAST_COLOUR ScrollView::AQUAMARINE ///< last rainbow colour
+#define CHILD_COLOUR ScrollView::BROWN     ///< colour of children
+/**
+* WERD::WERD
+*
+* Constructor to build a WERD from a list of C_BLOBs.
+*   blob_list     The C_BLOBs (in word order) are not copied;
+*                 we take its elements and put them in our lists.
+*   blank_count   blanks in front of the word
+*   text          correct text, outlives this WERD
+*/
+WERD::WERD(C_BLOB_LIST *blob_list, uint8_t blank_count, const char *text)
+: blanks(blank_count), flags(0), script_id_(0), correct(text ? text : "") {
+C_BLOB_IT start_it = &cblobs;
+C_BLOB_IT rej_cblob_it = &rej_cblobs;
+C_OUTLINE_IT c_outline_it;
+int16_t inverted_vote = 0;
+int16_t non_inverted_vote = 0;
+// Move blob_list's elements into cblobs.
+start_it.add_list_after(blob_list);
+/*
+Set white on black flag for the WERD, moving any duff blobs onto the
+rej_cblobs list.
+First, walk the cblobs checking the inverse flag for each outline of each
+cblob. If a cblob has inconsistent flag settings for its different
+outlines, move the blob to the reject list. Otherwise, increment the
+appropriate w-on-b or b-on-w vote for the word.
+Now set the inversion flag for the WERD by maximum vote.
+Walk the blobs again, moving any blob whose inversion flag does not agree
+with the concencus onto the reject list.
+*/
+start_it.set_to_list(&cblobs);
+if (start_it.empty()) {
+return;
+}
+for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
+bool reject_blob = false;
+bool blob_inverted;
+c_outline_it.set_to_list(start_it.data()->out_list());
+blob_inverted = c_outline_it.data()->flag(COUT_INVERSE);
+for (c_outline_it.mark_cycle_pt(); !c_outline_it.cycled_list() && !reject_blob;
+c_outline_it.forward()) {
+reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted;
+}
+if (reject_blob) {
+rej_cblob_it.add_after_then_move(start_it.extract());
+} else {
+if (blob_inverted) {
+inverted_vote++;
+} else {
+non_inverted_vote++;
+}
+}
+}
+flags.set(W_INVERSE, (inverted_vote > non_inverted_vote));
+start_it.set_to_list(&cblobs);
+if (start_it.empty()) {
+return;
+}
+for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
+c_outline_it.set_to_list(start_it.data()->out_list());
+if (c_outline_it.data()->flag(COUT_INVERSE) != flags[W_INVERSE]) {
+rej_cblob_it.add_after_then_move(start_it.extract());
+}
+}
+}
+/**
+* WERD::WERD
+*
+* Constructor to build a WERD from a list of C_BLOBs.
+* The C_BLOBs are not copied so the source list is emptied.
+*/
+WERD::WERD(C_BLOB_LIST *blob_list, ///< In word order
+WERD *clone)            ///< Source of flags
+: flags(clone->flags), script_id_(clone->script_id_), correct(clone->correct) {
+C_BLOB_IT start_it = blob_list; // iterator
+C_BLOB_IT end_it = blob_list;   // another
+while (!end_it.at_last()) {
+end_it.forward(); // move to last
+}
+cblobs.assign_to_sublist(&start_it, &end_it);
+// move to our list
+blanks = clone->blanks;
+//      fprintf(stderr,"Wrong constructor!!!!\n");
+}
+// Construct a WERD from a single_blob and clone the flags from this.
+// W_BOL and W_EOL flags are set according to the given values.
+WERD *WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob) {
+C_BLOB_LIST temp_blobs;
+C_BLOB_IT temp_it(&temp_blobs);
+temp_it.add_after_then_move(blob);
+WERD *blob_word = new WERD(&temp_blobs, this);
+blob_word->set_flag(W_BOL, bol);
+blob_word->set_flag(W_EOL, eol);
+return blob_word;
+}
+/**
+* WERD::bounding_box
+*
+* Return the bounding box of the WERD.
+* This is quite a mess to compute!
+* ORIGINALLY, REJECT CBLOBS WERE EXCLUDED, however, this led to bugs when the
+* words on the row were re-sorted. The original words were built with reject
+* blobs included. The FUZZY SPACE flags were set accordingly. If ALL the
+* blobs in a word are rejected the BB for the word is nullptr, causing the sort
+* to screw up, leading to the erroneous possibility of the first word in a
+* row being marked as FUZZY space.
+*/
+TBOX WERD::bounding_box() const {
+return restricted_bounding_box(true, true);
+}
+// Returns the bounding box including the desired combination of upper and
+// lower noise/diacritic elements.
+TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
+TBOX box = true_bounding_box();
+int bottom = box.bottom();
+int top = box.top();
+// This is a read-only iteration of the rejected blobs.
+C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&rej_cblobs));
+for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+TBOX dot_box = it.data()->bounding_box();
+if ((upper_dots || dot_box.bottom() <= top) && (lower_dots || dot_box.top() >= bottom)) {
+box += dot_box;
+}
+}
+return box;
+}
+// Returns the bounding box of only the good blobs.
+TBOX WERD::true_bounding_box() const {
+TBOX box; // box being built
+// This is a read-only iteration of the good blobs.
+C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&cblobs));
+for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+box += it.data()->bounding_box();
+}
+return box;
+}
+/**
+* WERD::move
+*
+* Reposition WERD by vector
+* NOTE!! REJECT CBLOBS ARE NOT MOVED
+*/
+void WERD::move(const ICOORD vec) {
+C_BLOB_IT cblob_it(&cblobs); // cblob iterator
+for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) {
+cblob_it.data()->move(vec);
+}
+}
+/**
+* WERD::join_on
+*
+* Join other word onto this one. Delete the old word.
+*/
+void WERD::join_on(WERD *other) {
+C_BLOB_IT blob_it(&cblobs);
+C_BLOB_IT src_it(&other->cblobs);
+C_BLOB_IT rej_cblob_it(&rej_cblobs);
+C_BLOB_IT src_rej_it(&other->rej_cblobs);
+while (!src_it.empty()) {
+blob_it.add_to_end(src_it.extract());
+src_it.forward();
+}
+while (!src_rej_it.empty()) {
+rej_cblob_it.add_to_end(src_rej_it.extract());
+src_rej_it.forward();
+}
+}
+/**
+* WERD::copy_on
+*
+* Copy blobs from other word onto this one.
+*/
+void WERD::copy_on(WERD *other) {
+bool reversed = other->bounding_box().left() < bounding_box().left();
+C_BLOB_IT c_blob_it(&cblobs);
+C_BLOB_LIST c_blobs;
+c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy);
+if (reversed) {
+c_blob_it.add_list_before(&c_blobs);
+} else {
+c_blob_it.move_to_last();
+c_blob_it.add_list_after(&c_blobs);
+}
+if (!other->rej_cblobs.empty()) {
+C_BLOB_IT rej_c_blob_it(&rej_cblobs);
+C_BLOB_LIST new_rej_c_blobs;
+new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy);
+if (reversed) {
+rej_c_blob_it.add_list_before(&new_rej_c_blobs);
+} else {
+rej_c_blob_it.move_to_last();
+rej_c_blob_it.add_list_after(&new_rej_c_blobs);
+}
+}
+}
+/**
+* WERD::print
+*
+* Display members
+*/
+void WERD::print() const {
+tprintf("Blanks= %d\n", blanks);
+bounding_box().print();
+tprintf("Flags = %lu = 0%lo\n", flags.to_ulong(), flags.to_ulong());
+tprintf("   W_SEGMENTED = %s\n", flags[W_SEGMENTED] ? "TRUE" : "FALSE");
+tprintf("   W_ITALIC = %s\n", flags[W_ITALIC] ? "TRUE" : "FALSE");
+tprintf("   W_BOL = %s\n", flags[W_BOL] ? "TRUE" : "FALSE");
+tprintf("   W_EOL = %s\n", flags[W_EOL] ? "TRUE" : "FALSE");
+tprintf("   W_NORMALIZED = %s\n", flags[W_NORMALIZED] ? "TRUE" : "FALSE");
+tprintf("   W_SCRIPT_HAS_XHEIGHT = %s\n", flags[W_SCRIPT_HAS_XHEIGHT] ? "TRUE" : "FALSE");
+tprintf("   W_SCRIPT_IS_LATIN = %s\n", flags[W_SCRIPT_IS_LATIN] ? "TRUE" : "FALSE");
+tprintf("   W_DONT_CHOP = %s\n", flags[W_DONT_CHOP] ? "TRUE" : "FALSE");
+tprintf("   W_REP_CHAR = %s\n", flags[W_REP_CHAR] ? "TRUE" : "FALSE");
+tprintf("   W_FUZZY_SP = %s\n", flags[W_FUZZY_SP] ? "TRUE" : "FALSE");
+tprintf("   W_FUZZY_NON = %s\n", flags[W_FUZZY_NON] ? "TRUE" : "FALSE");
+tprintf("Correct= %s\n", correct.c_str());
+tprintf("Rejected cblob count = %d\n", rej_cblobs.length());
+tprintf("Script = %d\n", script_id_);
+}
+/**
+* WERD::plot
+*
+* Draw the WERD in the given colour.
+*/
+#ifndef GRAPHICS_DISABLED
+void WERD::plot(ScrollView *window, ScrollView::Color colour) {
+C_BLOB_IT it = &cblobs;
+for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+it.data()->plot(window, colour, colour);
+}
+plot_rej_blobs(window);
+}
+// Get the next color in the (looping) rainbow.
+ScrollView::Color WERD::NextColor(ScrollView::Color colour) {
+auto next = static_cast<ScrollView::Color>(colour + 1);
+if (next >= LAST_COLOUR || next < FIRST_COLOUR) {
+next = FIRST_COLOUR;
+}
+return next;
+}
+/**
+* WERD::plot
+*
+* Draw the WERD in rainbow colours in window.
+*/
+void WERD::plot(ScrollView *window) {
+ScrollView::Color colour = FIRST_COLOUR;
+C_BLOB_IT it = &cblobs;
+for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+it.data()->plot(window, colour, CHILD_COLOUR);
+colour = NextColor(colour);
+}
+plot_rej_blobs(window);
+}
+/**
+* WERD::plot_rej_blobs
+*
+* Draw the WERD rejected blobs in window - ALWAYS GREY
+*/
+void WERD::plot_rej_blobs(ScrollView *window) {
+C_BLOB_IT it = &rej_cblobs;
+for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+it.data()->plot(window, ScrollView::GREY, ScrollView::GREY);
+}
+}
+#endif // !GRAPHICS_DISABLED
+/**
+* WERD::shallow_copy()
+*
+* Make a shallow copy of a word
+*/
+WERD *WERD::shallow_copy() {
+WERD *new_word = new WERD;
+new_word->blanks = blanks;
+new_word->flags = flags;
+new_word->correct = correct;
+return new_word;
+}
+/**
+* WERD::operator=
+*
+* Assign a word, DEEP copying the blob list
+*/
+WERD &WERD::operator=(const WERD &source) {
+this->ELIST2_LINK::operator=(source);
+blanks = source.blanks;
+flags = source.flags;
+script_id_ = source.script_id_;
+correct = source.correct;
+cblobs.clear();
+cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy);
+rej_cblobs.clear();
+rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy);
+return *this;
+}
+/**
+*  word_comparator()
+*
+*  word comparator used to sort a word list so that words are in increasing
+*  order of left edge.
+*/
+int word_comparator(const void *word1p, const void *word2p) {
+const WERD *word1 = *reinterpret_cast<const WERD *const *>(word1p);
+const WERD *word2 = *reinterpret_cast<const WERD *const *>(word2p);
+return word1->bounding_box().left() - word2->bounding_box().left();
+}
+/**
+*  WERD::ConstructWerdWithNewBlobs()
+*
+* This method returns a new werd constructed using the blobs in the input
+* all_blobs list, which correspond to the blobs in this werd object. The
+* blobs used to construct the new word are consumed and removed from the
+* input all_blobs list.
+* Returns nullptr if the word couldn't be constructed.
+* Returns original blobs for which no matches were found in the output list
+* orphan_blobs (appends).
+*/
+WERD *WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs) {
+C_BLOB_LIST current_blob_list;
+C_BLOB_IT werd_blobs_it(&current_blob_list);
+// Add the word's c_blobs.
+werd_blobs_it.add_list_after(cblob_list());
+// New blob list. These contain the blobs which will form the new word.
+C_BLOB_LIST new_werd_blobs;
+C_BLOB_IT new_blobs_it(&new_werd_blobs);
+// not_found_blobs contains the list of current word's blobs for which a
+// corresponding blob wasn't found in the input all_blobs list.
+C_BLOB_LIST not_found_blobs;
+C_BLOB_IT not_found_it(&not_found_blobs);
+not_found_it.move_to_last();
+werd_blobs_it.move_to_first();
+for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list(); werd_blobs_it.forward()) {
+C_BLOB *werd_blob = werd_blobs_it.extract();
+TBOX werd_blob_box = werd_blob->bounding_box();
+bool found = false;
+// Now find the corresponding blob for this blob in the all_blobs
+// list. For now, follow the inefficient method of pairwise
+// comparisons. Ideally, one can pre-bucket the blobs by row.
+C_BLOB_IT all_blobs_it(all_blobs);
+for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) {
+C_BLOB *a_blob = all_blobs_it.data();
+// Compute the overlap of the two blobs. If major, a_blob should
+// be added to the new blobs list.
+TBOX a_blob_box = a_blob->bounding_box();
+if (a_blob_box.null_box()) {
+tprintf("Bounding box couldn't be ascertained\n");
+}
+if (werd_blob_box.contains(a_blob_box) || werd_blob_box.major_overlap(a_blob_box)) {
+// Old blobs are from minimal splits, therefore are expected to be
+// bigger. The new small blobs should cover a significant portion.
+// This is it.
+all_blobs_it.extract();
+new_blobs_it.add_after_then_move(a_blob);
+found = true;
+}
+}
+if (!found) {
+not_found_it.add_after_then_move(werd_blob);
+} else {
+delete werd_blob;
+}
+}
+// Iterate over all not found blobs. Some of them may be due to
+// under-segmentation (which is OK, since the corresponding blob is already
+// in the list in that case.
+not_found_it.move_to_first();
+for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) {
+C_BLOB *not_found = not_found_it.data();
+TBOX not_found_box = not_found->bounding_box();
+C_BLOB_IT existing_blobs_it(new_blobs_it);
+for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list();
+existing_blobs_it.forward()) {
+C_BLOB *a_blob = existing_blobs_it.data();
+TBOX a_blob_box = a_blob->bounding_box();
+if ((not_found_box.major_overlap(a_blob_box) || a_blob_box.major_overlap(not_found_box)) &&
+not_found_box.y_overlap_fraction(a_blob_box) > 0.8) {
+// Already taken care of.
+delete not_found_it.extract();
+break;
+}
+}
+}
+if (orphan_blobs) {
+C_BLOB_IT orphan_blobs_it(orphan_blobs);
+orphan_blobs_it.move_to_last();
+orphan_blobs_it.add_list_after(&not_found_blobs);
+}
+// New blobs are ready. Create a new werd object with these.
+WERD *new_werd = nullptr;
+if (!new_werd_blobs.empty()) {
+new_werd = new WERD(&new_werd_blobs, this);
+} else {
+// Add the blobs back to this word so that it can be reused.
+C_BLOB_IT this_list_it(cblob_list());
+this_list_it.add_list_after(&not_found_blobs);
+}
+return new_werd;
+}
+// Removes noise from the word by moving small outlines to the rej_cblobs
+// list, based on the size_threshold.
+void WERD::CleanNoise(float size_threshold) {
+C_BLOB_IT blob_it(&cblobs);
+C_BLOB_IT rej_it(&rej_cblobs);
+for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+C_BLOB *blob = blob_it.data();
+C_OUTLINE_IT ol_it(blob->out_list());
+for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
+C_OUTLINE *outline = ol_it.data();
+TBOX ol_box = outline->bounding_box();
+int ol_size = ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height();
+if (ol_size < size_threshold) {
+// This outline is too small. Move it to a separate blob in the
+// reject blobs list.
+auto *rej_blob = new C_BLOB(ol_it.extract());
+rej_it.add_after_then_move(rej_blob);
+}
+}
+if (blob->out_list()->empty()) {
+delete blob_it.extract();
+}
+}
+}
+// Extracts all the noise outlines and stuffs the pointers into the given
+// vector of outlines. Afterwards, the outlines vector owns the pointers.
+void WERD::GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines) {
+C_BLOB_IT rej_it(&rej_cblobs);
+for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) {
+C_BLOB *blob = rej_it.extract();
+C_OUTLINE_IT ol_it(blob->out_list());
+outlines->push_back(ol_it.extract());
+delete blob;
+}
+}
+// Adds the selected outlines to the indcated real blobs, and puts the rest
+// back in rej_cblobs where they came from. Where the target_blobs entry is
+// nullptr, a run of wanted outlines is put into a single new blob.
+// Ownership of the outlines is transferred back to the word. (Hence
+// vector and not PointerVector.)
+// Returns true if any new blob was added to the start of the word, which
+// suggests that it might need joining to the word before it, and likewise
+// sets make_next_word_fuzzy true if any new blob was added to the end.
+bool WERD::AddSelectedOutlines(const std::vector<bool> &wanted,
+const std::vector<C_BLOB *> &target_blobs,
+const std::vector<C_OUTLINE *> &outlines,
+bool *make_next_word_fuzzy) {
+bool outline_added_to_start = false;
+if (make_next_word_fuzzy != nullptr) {
+*make_next_word_fuzzy = false;
+}
+C_BLOB_IT rej_it(&rej_cblobs);
+for (unsigned i = 0; i < outlines.size(); ++i) {
+C_OUTLINE *outline = outlines[i];
+if (outline == nullptr) {
+continue; // Already used it.
+}
+if (wanted[i]) {
+C_BLOB *target_blob = target_blobs[i];
+TBOX noise_box = outline->bounding_box();
+if (target_blob == nullptr) {
+target_blob = new C_BLOB(outline);
+// Need to find the insertion point.
+C_BLOB_IT blob_it(&cblobs);
+for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+C_BLOB *blob = blob_it.data();
+TBOX blob_box = blob->bounding_box();
+if (blob_box.left() > noise_box.left()) {
+if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) {
+// We might want to join this word to its predecessor.
+outline_added_to_start = true;
+}
+blob_it.add_before_stay_put(target_blob);
+break;
+}
+}
+if (blob_it.cycled_list()) {
+blob_it.add_to_end(target_blob);
+if (make_next_word_fuzzy != nullptr) {
+*make_next_word_fuzzy = true;
+}
+}
+// Add all consecutive wanted, but null-blob outlines to same blob.
+C_OUTLINE_IT ol_it(target_blob->out_list());
+while (i + 1 < outlines.size() && wanted[i + 1] && target_blobs[i + 1] == nullptr) {
+++i;
+ol_it.add_to_end(outlines[i]);
+}
+} else {
+// Insert outline into this blob.
+C_OUTLINE_IT ol_it(target_blob->out_list());
+ol_it.add_to_end(outline);
+}
+} else {
+// Put back on noise list.
+rej_it.add_to_end(new C_BLOB(outline));
+}
+}
+return outline_added_to_start;
+}
+} // namespace tesseract

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/ccstruct/werd.cpp @ 2:b50eed0cc0ef upstream