Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/edgblob.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/edgblob.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,470 @@ +/********************************************************************** + * File: edgblob.cpp (Formerly edgeloop.c) + * Description: Functions to clean up an outline before approximation. + * Author: Ray Smith + * + *(C) Copyright 1991, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0(the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "edgblob.h" + +#include "edgloop.h" +#include "scanedg.h" + +#define BUCKETSIZE 16 + +namespace tesseract { + +// Control parameters used in outline_complexity(), which rejects an outline +// if any one of the 3 conditions is satisfied: +// - number of children exceeds edges_max_children_per_outline +// - number of nested layers exceeds edges_max_children_layers +// - joint complexity exceeds edges_children_count_limit(as in child_count()) +static BOOL_VAR(edges_use_new_outline_complexity, false, + "Use the new outline complexity module"); +static INT_VAR(edges_max_children_per_outline, 10, + "Max number of children inside a character outline"); +static INT_VAR(edges_max_children_layers, 5, + "Max layers of nested children inside a character outline"); +static BOOL_VAR(edges_debug, false, "turn on debugging for this module"); + +static INT_VAR(edges_children_per_grandchild, 10, + "Importance ratio for chucking outlines"); +static INT_VAR(edges_children_count_limit, 45, "Max holes allowed in blob"); +static BOOL_VAR(edges_children_fix, false, + "Remove boxy parents of char-like children"); +static INT_VAR(edges_min_nonhole, 12, "Min pixels for potential char in box"); +static INT_VAR(edges_patharea_ratio, 40, + "Max lensq/area for acceptable child outline"); +static double_VAR(edges_childarea, 0.5, "Min area fraction of child outline"); +static double_VAR(edges_boxarea, 0.875, + "Min area fraction of grandchild for box"); + +/** + * @name OL_BUCKETS::OL_BUCKETS + * + * Construct an array of buckets for associating outlines into blobs. + */ + +OL_BUCKETS::OL_BUCKETS(ICOORD bleft, // corners + ICOORD tright) + : bxdim((tright.x() - bleft.x()) / BUCKETSIZE + 1), + bydim((tright.y() - bleft.y()) / BUCKETSIZE + 1), + buckets(bxdim * bydim), + bl(bleft), + tr(tright) {} + +/** + * @name OL_BUCKETS::operator( + * + * Return a pointer to a list of C_OUTLINEs corresponding to the + * given pixel coordinates. + */ + +C_OUTLINE_LIST *OL_BUCKETS::operator()( // array access + TDimension x, // image coords + TDimension y) { + return &buckets[(y - bl.y()) / BUCKETSIZE * bxdim + + (x - bl.x()) / BUCKETSIZE]; +} + +C_OUTLINE_LIST *OL_BUCKETS::start_scan() { + return scan_next(buckets.begin()); +} + +C_OUTLINE_LIST *OL_BUCKETS::scan_next() { + return scan_next(it); +} + +C_OUTLINE_LIST *OL_BUCKETS::scan_next(decltype(buckets)::iterator in_it) { + it = std::find_if(in_it, buckets.end(), [](auto &&b) { return !b.empty(); }); + if (it == buckets.end()) + return nullptr; + return &*it; +} + +/** + * @name OL_BUCKETS::outline_complexity + * + * This is the new version of count_child. + * + * The goal of this function is to determine if an outline and its + * interiors could be part of a character blob. This is done by + * computing a "complexity" index for the outline, which is the return + * value of this function, and checking it against a threshold. + * The max_count is used for short-circuiting the recursion and forcing + * a rejection that guarantees to fail the threshold test. + * The complexity F for outline X with N children X[i] is + * F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild + * so each layer of nesting increases complexity exponentially. + * An outline can be rejected as a text blob candidate if its complexity + * is too high, has too many children(likely a container), or has too + * many layers of nested inner loops. This has the side-effect of + * flattening out boxed or reversed video text regions. + */ + +int32_t OL_BUCKETS::outline_complexity(C_OUTLINE *outline, // parent outline + int32_t max_count, // max output + int16_t depth // recursion depth +) { + TDimension xmin, xmax; // coord limits + TDimension ymin, ymax; + C_OUTLINE *child; // current child + int32_t child_count; // no of children + int32_t grandchild_count; // no of grandchildren + C_OUTLINE_IT child_it; // search iterator + + TBOX olbox = outline->bounding_box(); + xmin = (olbox.left() - bl.x()) / BUCKETSIZE; + xmax = (olbox.right() - bl.x()) / BUCKETSIZE; + ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE; + ymax = (olbox.top() - bl.y()) / BUCKETSIZE; + child_count = 0; + grandchild_count = 0; + if (++depth > edges_max_children_layers) { // nested loops are too deep + return max_count + depth; + } + + for (auto yindex = ymin; yindex <= ymax; yindex++) { + for (auto xindex = xmin; xindex <= xmax; xindex++) { + child_it.set_to_list(&buckets[yindex * bxdim + xindex]); + if (child_it.empty()) { + continue; + } + for (child_it.mark_cycle_pt(); !child_it.cycled_list(); + child_it.forward()) { + child = child_it.data(); + if (child == outline || !(*child < *outline)) { + continue; + } + child_count++; + + if (child_count > edges_max_children_per_outline) { // too fragmented + if (edges_debug) { + tprintf( + "Discard outline on child_count=%d > " + "max_children_per_outline=%d\n", + child_count, + static_cast<int32_t>(edges_max_children_per_outline)); + } + return max_count + child_count; + } + + // Compute the "complexity" of each child recursively + int32_t remaining_count = max_count - child_count - grandchild_count; + if (remaining_count > 0) { + grandchild_count += edges_children_per_grandchild * + outline_complexity(child, remaining_count, depth); + } + if (child_count + grandchild_count > max_count) { // too complex + if (edges_debug) { + tprintf( + "Discard outline on child_count=%d + grandchild_count=%d " + "> max_count=%d\n", + child_count, grandchild_count, max_count); + } + return child_count + grandchild_count; + } + } + } + } + return child_count + grandchild_count; +} + +/** + * @name OL_BUCKETS::count_children + * + * Find number of descendants of this outline. + */ +// TODO(rays) Merge with outline_complexity. +int32_t OL_BUCKETS::count_children( // recursive count + C_OUTLINE *outline, // parent outline + int32_t max_count // max output +) { + bool parent_box; // could it be boxy + TDimension xmin, xmax; // coord limits + TDimension ymin, ymax; + C_OUTLINE *child; // current child + int32_t child_count; // no of children + int32_t grandchild_count; // no of grandchildren + int32_t parent_area; // potential box + float max_parent_area; // potential box + int32_t child_area; // current child + int32_t child_length; // current child + TBOX olbox; + C_OUTLINE_IT child_it; // search iterator + + olbox = outline->bounding_box(); + xmin = (olbox.left() - bl.x()) / BUCKETSIZE; + xmax = (olbox.right() - bl.x()) / BUCKETSIZE; + ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE; + ymax = (olbox.top() - bl.y()) / BUCKETSIZE; + child_count = 0; + grandchild_count = 0; + parent_area = 0; + max_parent_area = 0; + parent_box = true; + for (auto yindex = ymin; yindex <= ymax; yindex++) { + for (auto xindex = xmin; xindex <= xmax; xindex++) { + child_it.set_to_list(&buckets[yindex * bxdim + xindex]); + if (child_it.empty()) { + continue; + } + for (child_it.mark_cycle_pt(); !child_it.cycled_list(); + child_it.forward()) { + child = child_it.data(); + if (child != outline && *child < *outline) { + child_count++; + if (child_count <= max_count) { + int max_grand = + (max_count - child_count) / edges_children_per_grandchild; + if (max_grand > 0) { + grandchild_count += count_children(child, max_grand) * + edges_children_per_grandchild; + } else { + grandchild_count += count_children(child, 1); + } + } + if (child_count + grandchild_count > max_count) { + if (edges_debug) { + tprintf("Discarding parent with child count=%d, gc=%d\n", + child_count, grandchild_count); + } + return child_count + grandchild_count; + } + if (parent_area == 0) { + parent_area = outline->outer_area(); + if (parent_area < 0) { + parent_area = -parent_area; + } + max_parent_area = outline->bounding_box().area() * edges_boxarea; + if (parent_area < max_parent_area) { + parent_box = false; + } + } + if (parent_box && + (!edges_children_fix || + child->bounding_box().height() > edges_min_nonhole)) { + child_area = child->outer_area(); + if (child_area < 0) { + child_area = -child_area; + } + if (edges_children_fix) { + if (parent_area - child_area < max_parent_area) { + parent_box = false; + continue; + } + if (grandchild_count > 0) { + if (edges_debug) { + tprintf( + "Discarding parent of area %d, child area=%d, max%g " + "with gc=%d\n", + parent_area, child_area, max_parent_area, + grandchild_count); + } + return max_count + 1; + } + child_length = child->pathlength(); + if (child_length * child_length > + child_area * edges_patharea_ratio) { + if (edges_debug) { + tprintf( + "Discarding parent of area %d, child area=%d, max%g " + "with child length=%d\n", + parent_area, child_area, max_parent_area, child_length); + } + return max_count + 1; + } + } + if (child_area < child->bounding_box().area() * edges_childarea) { + if (edges_debug) { + tprintf( + "Discarding parent of area %d, child area=%d, max%g " + "with child rect=%d\n", + parent_area, child_area, max_parent_area, + child->bounding_box().area()); + } + return max_count + 1; + } + } + } + } + } + } + return child_count + grandchild_count; +} + +/** + * @name OL_BUCKETS::extract_children + * + * Find number of descendants of this outline. + */ + +void OL_BUCKETS::extract_children( // recursive count + C_OUTLINE *outline, // parent outline + C_OUTLINE_IT *it // destination iterator +) { + TDimension xmin, xmax; // coord limits + TDimension ymin, ymax; + TBOX olbox; + C_OUTLINE_IT child_it; // search iterator + + olbox = outline->bounding_box(); + xmin = (olbox.left() - bl.x()) / BUCKETSIZE; + xmax = (olbox.right() - bl.x()) / BUCKETSIZE; + ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE; + ymax = (olbox.top() - bl.y()) / BUCKETSIZE; + for (auto yindex = ymin; yindex <= ymax; yindex++) { + for (auto xindex = xmin; xindex <= xmax; xindex++) { + child_it.set_to_list(&buckets[yindex * bxdim + xindex]); + for (child_it.mark_cycle_pt(); !child_it.cycled_list(); + child_it.forward()) { + if (*child_it.data() < *outline) { + it->add_after_then_move(child_it.extract()); + } + } + } + } +} + +/// @name extract_edges + +void extract_edges(Image pix, // thresholded image + BLOCK *block) { // block to scan + C_OUTLINE_LIST outlines; // outlines in block + C_OUTLINE_IT out_it = &outlines; + + block_edges(pix, &(block->pdblk), &out_it); + ICOORD bleft; // block box + ICOORD tright; + block->pdblk.bounding_box(bleft, tright); + // make blobs + outlines_to_blobs(block, bleft, tright, &outlines); +} + +/// @name fill_buckets + +static void fill_buckets(C_OUTLINE_LIST *outlines, // outlines in block + OL_BUCKETS *buckets // output buckets +) { + C_OUTLINE_IT out_it = outlines; // iterator + C_OUTLINE_IT bucket_it; // iterator in bucket + + for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { + auto outline = out_it.extract(); // take off list + // get box + const TBOX &ol_box(outline->bounding_box()); + bucket_it.set_to_list((*buckets)(ol_box.left(), ol_box.bottom())); + bucket_it.add_to_end(outline); + } +} + +/** + * @name capture_children + * + * Find all neighbouring outlines that are children of this outline + * and either move them to the output list or declare this outline + * illegal and return false. + */ + +static bool capture_children(OL_BUCKETS *buckets, // bucket sort class + C_BLOB_IT *reject_it, // dead grandchildren + C_OUTLINE_IT *blob_it // output outlines +) { + // master outline + auto outline = blob_it->data(); + // no of children + int32_t child_count; + if (edges_use_new_outline_complexity) { + child_count = + buckets->outline_complexity(outline, edges_children_count_limit, 0); + } else { + child_count = buckets->count_children(outline, edges_children_count_limit); + } + if (child_count > edges_children_count_limit) { + return false; + } + + if (child_count > 0) { + buckets->extract_children(outline, blob_it); + } + return true; +} + +/** + * @name empty_buckets + * + * Run the edge detector over the block and return a list of blobs. + */ + +static void empty_buckets(BLOCK *block, // block to scan + OL_BUCKETS *buckets // output buckets +) { + C_OUTLINE_LIST outlines; // outlines in block + // iterator + C_OUTLINE_IT out_it = &outlines; + auto start_scan = buckets->start_scan(); + if (start_scan == nullptr) { + return; + } + C_OUTLINE_IT bucket_it = start_scan; + C_BLOB_IT good_blobs = block->blob_list(); + C_BLOB_IT junk_blobs = block->reject_blobs(); + + while (!bucket_it.empty()) { + out_it.set_to_list(&outlines); + C_OUTLINE_IT parent_it; // parent outline + do { + parent_it = bucket_it; // find outermost + do { + bucket_it.forward(); + } while (!bucket_it.at_first() && + !(*parent_it.data() < *bucket_it.data())); + } while (!bucket_it.at_first()); + + // move to new list + out_it.add_after_then_move(parent_it.extract()); + // healthy blob + bool good_blob = capture_children(buckets, &junk_blobs, &out_it); + C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs, + &junk_blobs); + + if (auto l = buckets->scan_next()) + bucket_it.set_to_list(l); + else + break; + } +} + +/** + * @name outlines_to_blobs + * + * Gather together outlines into blobs using the usual bucket sort. + */ + +void outlines_to_blobs( // find blobs + BLOCK *block, // block to scan + ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines) { + // make buckets + OL_BUCKETS buckets(bleft, tright); + + fill_buckets(outlines, &buckets); + empty_buckets(block, &buckets); +} + +} // namespace tesseract
