Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccstruct/boxword.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/boxword.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,211 @@ +/////////////////////////////////////////////////////////////////////// +// File: boxword.cpp +// Description: Class to represent the bounding boxes of the output. +// Author: Ray Smith +// +// (C) Copyright 2010, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include "boxword.h" +#include "blobs.h" +#include "host.h" // for NearlyEqual +#include "normalis.h" +#include "ocrblock.h" +#include "pageres.h" + +namespace tesseract { + +// Clip output boxes to input blob boxes for bounds that are within this +// tolerance. Otherwise, the blob may be chopped and we have to just use +// the word bounding box. +const int kBoxClipTolerance = 2; + +BoxWord::BoxWord() : length_(0) {} + +BoxWord::BoxWord(const BoxWord &src) { + CopyFrom(src); +} + +BoxWord &BoxWord::operator=(const BoxWord &src) { + CopyFrom(src); + return *this; +} + +void BoxWord::CopyFrom(const BoxWord &src) { + bbox_ = src.bbox_; + length_ = src.length_; + boxes_.clear(); + boxes_.reserve(length_); + for (unsigned i = 0; i < length_; ++i) { + boxes_.push_back(src.boxes_[i]); + } +} + +// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to +// switch back to original image coordinates. +BoxWord *BoxWord::CopyFromNormalized(TWERD *tessword) { + auto *boxword = new BoxWord(); + // Count the blobs. + boxword->length_ = tessword->NumBlobs(); + // Allocate memory. + boxword->boxes_.reserve(boxword->length_); + + for (unsigned b = 0; b < boxword->length_; ++b) { + TBLOB *tblob = tessword->blobs[b]; + TBOX blob_box; + for (TESSLINE *outline = tblob->outlines; outline != nullptr; + outline = outline->next) { + EDGEPT *edgept = outline->loop; + // Iterate over the edges. + do { + if (!edgept->IsHidden() || !edgept->prev->IsHidden()) { + ICOORD pos(edgept->pos.x, edgept->pos.y); + TPOINT denormed; + tblob->denorm().DenormTransform(nullptr, edgept->pos, &denormed); + pos.set_x(denormed.x); + pos.set_y(denormed.y); + TBOX pt_box(pos, pos); + blob_box += pt_box; + } + edgept = edgept->next; + } while (edgept != outline->loop); + } + boxword->boxes_.push_back(blob_box); + } + boxword->ComputeBoundingBox(); + return boxword; +} + +// Clean up the bounding boxes from the polygonal approximation by +// expanding slightly, then clipping to the blobs from the original_word +// that overlap. If not null, the block provides the inverse rotation. +void BoxWord::ClipToOriginalWord(const BLOCK *block, WERD *original_word) { + for (unsigned i = 0; i < length_; ++i) { + TBOX box = boxes_[i]; + // Expand by a single pixel, as the poly approximation error is 1 pixel. + box = + TBOX(box.left() - 1, box.bottom() - 1, box.right() + 1, box.top() + 1); + // Now find the original box that matches. + TBOX original_box; + C_BLOB_IT b_it(original_word->cblob_list()); + for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { + TBOX blob_box = b_it.data()->bounding_box(); + if (block != nullptr) { + blob_box.rotate(block->re_rotation()); + } + if (blob_box.major_overlap(box)) { + original_box += blob_box; + } + } + if (!original_box.null_box()) { + if (NearlyEqual<int>(original_box.left(), box.left(), + kBoxClipTolerance)) { + box.set_left(original_box.left()); + } + if (NearlyEqual<int>(original_box.right(), box.right(), + kBoxClipTolerance)) { + box.set_right(original_box.right()); + } + if (NearlyEqual<int>(original_box.top(), box.top(), kBoxClipTolerance)) { + box.set_top(original_box.top()); + } + if (NearlyEqual<int>(original_box.bottom(), box.bottom(), + kBoxClipTolerance)) { + box.set_bottom(original_box.bottom()); + } + } + original_box = original_word->bounding_box(); + if (block != nullptr) { + original_box.rotate(block->re_rotation()); + } + boxes_[i] = box.intersection(original_box); + } + ComputeBoundingBox(); +} + +// Merges the boxes from start to end, not including end, and deletes +// the boxes between start and end. +void BoxWord::MergeBoxes(unsigned start, unsigned end) { + start = ClipToRange(start, 0U, length_); + end = ClipToRange(end, 0U, length_); + if (end <= start + 1) { + return; + } + for (unsigned i = start + 1; i < end; ++i) { + boxes_[start] += boxes_[i]; + } + int shrinkage = end - 1 - start; + length_ -= shrinkage; + for (unsigned i = start + 1; i < length_; ++i) { + boxes_[i] = boxes_[i + shrinkage]; + } + boxes_.resize(length_); +} + +// Inserts a new box before the given index. +// Recomputes the bounding box. +void BoxWord::InsertBox(unsigned index, const TBOX &box) { + if (index < length_) { + boxes_.insert(boxes_.begin() + index, box); + } else { + boxes_.push_back(box); + } + length_ = boxes_.size(); + ComputeBoundingBox(); +} + +// Changes the box at the given index to the new box. +// Recomputes the bounding box. +void BoxWord::ChangeBox(unsigned index, const TBOX &box) { + boxes_[index] = box; + ComputeBoundingBox(); +} + +// Deletes the box with the given index, and shuffles up the rest. +// Recomputes the bounding box. +void BoxWord::DeleteBox(unsigned index) { + ASSERT_HOST(index < length_); + boxes_.erase(boxes_.begin() + index); + --length_; + ComputeBoundingBox(); +} + +// Deletes all the boxes stored in BoxWord. +void BoxWord::DeleteAllBoxes() { + length_ = 0; + boxes_.clear(); + bbox_ = TBOX(); +} + +// Computes the bounding box of the word. +void BoxWord::ComputeBoundingBox() { + bbox_ = TBOX(); + for (unsigned i = 0; i < length_; ++i) { + bbox_ += boxes_[i]; + } +} + +// This and other putatively are the same, so call the (permanent) callback +// for each blob index where the bounding boxes match. +// The callback is deleted on completion. +void BoxWord::ProcessMatchedBlobs(const TWERD &other, + const std::function<void(int)> &cb) const { + for (unsigned i = 0; i < length_ && i < other.NumBlobs(); ++i) { + TBOX blob_box = other.blobs[i]->bounding_box(); + if (blob_box == boxes_[i]) { + cb(i); + } + } +} + +} // namespace tesseract.
