Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/edgblob.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: edgblob.cpp (Formerly edgeloop.c) | |
| 3 * Description: Functions to clean up an outline before approximation. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 *(C) Copyright 1991, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0(the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 // Include automatically generated configuration file if running autoconf. | |
| 20 #ifdef HAVE_CONFIG_H | |
| 21 # include "config_auto.h" | |
| 22 #endif | |
| 23 | |
| 24 #include "edgblob.h" | |
| 25 | |
| 26 #include "edgloop.h" | |
| 27 #include "scanedg.h" | |
| 28 | |
| 29 #define BUCKETSIZE 16 | |
| 30 | |
| 31 namespace tesseract { | |
| 32 | |
| 33 // Control parameters used in outline_complexity(), which rejects an outline | |
| 34 // if any one of the 3 conditions is satisfied: | |
| 35 // - number of children exceeds edges_max_children_per_outline | |
| 36 // - number of nested layers exceeds edges_max_children_layers | |
| 37 // - joint complexity exceeds edges_children_count_limit(as in child_count()) | |
| 38 static BOOL_VAR(edges_use_new_outline_complexity, false, | |
| 39 "Use the new outline complexity module"); | |
| 40 static INT_VAR(edges_max_children_per_outline, 10, | |
| 41 "Max number of children inside a character outline"); | |
| 42 static INT_VAR(edges_max_children_layers, 5, | |
| 43 "Max layers of nested children inside a character outline"); | |
| 44 static BOOL_VAR(edges_debug, false, "turn on debugging for this module"); | |
| 45 | |
| 46 static INT_VAR(edges_children_per_grandchild, 10, | |
| 47 "Importance ratio for chucking outlines"); | |
| 48 static INT_VAR(edges_children_count_limit, 45, "Max holes allowed in blob"); | |
| 49 static BOOL_VAR(edges_children_fix, false, | |
| 50 "Remove boxy parents of char-like children"); | |
| 51 static INT_VAR(edges_min_nonhole, 12, "Min pixels for potential char in box"); | |
| 52 static INT_VAR(edges_patharea_ratio, 40, | |
| 53 "Max lensq/area for acceptable child outline"); | |
| 54 static double_VAR(edges_childarea, 0.5, "Min area fraction of child outline"); | |
| 55 static double_VAR(edges_boxarea, 0.875, | |
| 56 "Min area fraction of grandchild for box"); | |
| 57 | |
| 58 /** | |
| 59 * @name OL_BUCKETS::OL_BUCKETS | |
| 60 * | |
| 61 * Construct an array of buckets for associating outlines into blobs. | |
| 62 */ | |
| 63 | |
| 64 OL_BUCKETS::OL_BUCKETS(ICOORD bleft, // corners | |
| 65 ICOORD tright) | |
| 66 : bxdim((tright.x() - bleft.x()) / BUCKETSIZE + 1), | |
| 67 bydim((tright.y() - bleft.y()) / BUCKETSIZE + 1), | |
| 68 buckets(bxdim * bydim), | |
| 69 bl(bleft), | |
| 70 tr(tright) {} | |
| 71 | |
| 72 /** | |
| 73 * @name OL_BUCKETS::operator( | |
| 74 * | |
| 75 * Return a pointer to a list of C_OUTLINEs corresponding to the | |
| 76 * given pixel coordinates. | |
| 77 */ | |
| 78 | |
| 79 C_OUTLINE_LIST *OL_BUCKETS::operator()( // array access | |
| 80 TDimension x, // image coords | |
| 81 TDimension y) { | |
| 82 return &buckets[(y - bl.y()) / BUCKETSIZE * bxdim + | |
| 83 (x - bl.x()) / BUCKETSIZE]; | |
| 84 } | |
| 85 | |
| 86 C_OUTLINE_LIST *OL_BUCKETS::start_scan() { | |
| 87 return scan_next(buckets.begin()); | |
| 88 } | |
| 89 | |
| 90 C_OUTLINE_LIST *OL_BUCKETS::scan_next() { | |
| 91 return scan_next(it); | |
| 92 } | |
| 93 | |
| 94 C_OUTLINE_LIST *OL_BUCKETS::scan_next(decltype(buckets)::iterator in_it) { | |
| 95 it = std::find_if(in_it, buckets.end(), [](auto &&b) { return !b.empty(); }); | |
| 96 if (it == buckets.end()) | |
| 97 return nullptr; | |
| 98 return &*it; | |
| 99 } | |
| 100 | |
| 101 /** | |
| 102 * @name OL_BUCKETS::outline_complexity | |
| 103 * | |
| 104 * This is the new version of count_child. | |
| 105 * | |
| 106 * The goal of this function is to determine if an outline and its | |
| 107 * interiors could be part of a character blob. This is done by | |
| 108 * computing a "complexity" index for the outline, which is the return | |
| 109 * value of this function, and checking it against a threshold. | |
| 110 * The max_count is used for short-circuiting the recursion and forcing | |
| 111 * a rejection that guarantees to fail the threshold test. | |
| 112 * The complexity F for outline X with N children X[i] is | |
| 113 * F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild | |
| 114 * so each layer of nesting increases complexity exponentially. | |
| 115 * An outline can be rejected as a text blob candidate if its complexity | |
| 116 * is too high, has too many children(likely a container), or has too | |
| 117 * many layers of nested inner loops. This has the side-effect of | |
| 118 * flattening out boxed or reversed video text regions. | |
| 119 */ | |
| 120 | |
| 121 int32_t OL_BUCKETS::outline_complexity(C_OUTLINE *outline, // parent outline | |
| 122 int32_t max_count, // max output | |
| 123 int16_t depth // recursion depth | |
| 124 ) { | |
| 125 TDimension xmin, xmax; // coord limits | |
| 126 TDimension ymin, ymax; | |
| 127 C_OUTLINE *child; // current child | |
| 128 int32_t child_count; // no of children | |
| 129 int32_t grandchild_count; // no of grandchildren | |
| 130 C_OUTLINE_IT child_it; // search iterator | |
| 131 | |
| 132 TBOX olbox = outline->bounding_box(); | |
| 133 xmin = (olbox.left() - bl.x()) / BUCKETSIZE; | |
| 134 xmax = (olbox.right() - bl.x()) / BUCKETSIZE; | |
| 135 ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE; | |
| 136 ymax = (olbox.top() - bl.y()) / BUCKETSIZE; | |
| 137 child_count = 0; | |
| 138 grandchild_count = 0; | |
| 139 if (++depth > edges_max_children_layers) { // nested loops are too deep | |
| 140 return max_count + depth; | |
| 141 } | |
| 142 | |
| 143 for (auto yindex = ymin; yindex <= ymax; yindex++) { | |
| 144 for (auto xindex = xmin; xindex <= xmax; xindex++) { | |
| 145 child_it.set_to_list(&buckets[yindex * bxdim + xindex]); | |
| 146 if (child_it.empty()) { | |
| 147 continue; | |
| 148 } | |
| 149 for (child_it.mark_cycle_pt(); !child_it.cycled_list(); | |
| 150 child_it.forward()) { | |
| 151 child = child_it.data(); | |
| 152 if (child == outline || !(*child < *outline)) { | |
| 153 continue; | |
| 154 } | |
| 155 child_count++; | |
| 156 | |
| 157 if (child_count > edges_max_children_per_outline) { // too fragmented | |
| 158 if (edges_debug) { | |
| 159 tprintf( | |
| 160 "Discard outline on child_count=%d > " | |
| 161 "max_children_per_outline=%d\n", | |
| 162 child_count, | |
| 163 static_cast<int32_t>(edges_max_children_per_outline)); | |
| 164 } | |
| 165 return max_count + child_count; | |
| 166 } | |
| 167 | |
| 168 // Compute the "complexity" of each child recursively | |
| 169 int32_t remaining_count = max_count - child_count - grandchild_count; | |
| 170 if (remaining_count > 0) { | |
| 171 grandchild_count += edges_children_per_grandchild * | |
| 172 outline_complexity(child, remaining_count, depth); | |
| 173 } | |
| 174 if (child_count + grandchild_count > max_count) { // too complex | |
| 175 if (edges_debug) { | |
| 176 tprintf( | |
| 177 "Discard outline on child_count=%d + grandchild_count=%d " | |
| 178 "> max_count=%d\n", | |
| 179 child_count, grandchild_count, max_count); | |
| 180 } | |
| 181 return child_count + grandchild_count; | |
| 182 } | |
| 183 } | |
| 184 } | |
| 185 } | |
| 186 return child_count + grandchild_count; | |
| 187 } | |
| 188 | |
| 189 /** | |
| 190 * @name OL_BUCKETS::count_children | |
| 191 * | |
| 192 * Find number of descendants of this outline. | |
| 193 */ | |
| 194 // TODO(rays) Merge with outline_complexity. | |
| 195 int32_t OL_BUCKETS::count_children( // recursive count | |
| 196 C_OUTLINE *outline, // parent outline | |
| 197 int32_t max_count // max output | |
| 198 ) { | |
| 199 bool parent_box; // could it be boxy | |
| 200 TDimension xmin, xmax; // coord limits | |
| 201 TDimension ymin, ymax; | |
| 202 C_OUTLINE *child; // current child | |
| 203 int32_t child_count; // no of children | |
| 204 int32_t grandchild_count; // no of grandchildren | |
| 205 int32_t parent_area; // potential box | |
| 206 float max_parent_area; // potential box | |
| 207 int32_t child_area; // current child | |
| 208 int32_t child_length; // current child | |
| 209 TBOX olbox; | |
| 210 C_OUTLINE_IT child_it; // search iterator | |
| 211 | |
| 212 olbox = outline->bounding_box(); | |
| 213 xmin = (olbox.left() - bl.x()) / BUCKETSIZE; | |
| 214 xmax = (olbox.right() - bl.x()) / BUCKETSIZE; | |
| 215 ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE; | |
| 216 ymax = (olbox.top() - bl.y()) / BUCKETSIZE; | |
| 217 child_count = 0; | |
| 218 grandchild_count = 0; | |
| 219 parent_area = 0; | |
| 220 max_parent_area = 0; | |
| 221 parent_box = true; | |
| 222 for (auto yindex = ymin; yindex <= ymax; yindex++) { | |
| 223 for (auto xindex = xmin; xindex <= xmax; xindex++) { | |
| 224 child_it.set_to_list(&buckets[yindex * bxdim + xindex]); | |
| 225 if (child_it.empty()) { | |
| 226 continue; | |
| 227 } | |
| 228 for (child_it.mark_cycle_pt(); !child_it.cycled_list(); | |
| 229 child_it.forward()) { | |
| 230 child = child_it.data(); | |
| 231 if (child != outline && *child < *outline) { | |
| 232 child_count++; | |
| 233 if (child_count <= max_count) { | |
| 234 int max_grand = | |
| 235 (max_count - child_count) / edges_children_per_grandchild; | |
| 236 if (max_grand > 0) { | |
| 237 grandchild_count += count_children(child, max_grand) * | |
| 238 edges_children_per_grandchild; | |
| 239 } else { | |
| 240 grandchild_count += count_children(child, 1); | |
| 241 } | |
| 242 } | |
| 243 if (child_count + grandchild_count > max_count) { | |
| 244 if (edges_debug) { | |
| 245 tprintf("Discarding parent with child count=%d, gc=%d\n", | |
| 246 child_count, grandchild_count); | |
| 247 } | |
| 248 return child_count + grandchild_count; | |
| 249 } | |
| 250 if (parent_area == 0) { | |
| 251 parent_area = outline->outer_area(); | |
| 252 if (parent_area < 0) { | |
| 253 parent_area = -parent_area; | |
| 254 } | |
| 255 max_parent_area = outline->bounding_box().area() * edges_boxarea; | |
| 256 if (parent_area < max_parent_area) { | |
| 257 parent_box = false; | |
| 258 } | |
| 259 } | |
| 260 if (parent_box && | |
| 261 (!edges_children_fix || | |
| 262 child->bounding_box().height() > edges_min_nonhole)) { | |
| 263 child_area = child->outer_area(); | |
| 264 if (child_area < 0) { | |
| 265 child_area = -child_area; | |
| 266 } | |
| 267 if (edges_children_fix) { | |
| 268 if (parent_area - child_area < max_parent_area) { | |
| 269 parent_box = false; | |
| 270 continue; | |
| 271 } | |
| 272 if (grandchild_count > 0) { | |
| 273 if (edges_debug) { | |
| 274 tprintf( | |
| 275 "Discarding parent of area %d, child area=%d, max%g " | |
| 276 "with gc=%d\n", | |
| 277 parent_area, child_area, max_parent_area, | |
| 278 grandchild_count); | |
| 279 } | |
| 280 return max_count + 1; | |
| 281 } | |
| 282 child_length = child->pathlength(); | |
| 283 if (child_length * child_length > | |
| 284 child_area * edges_patharea_ratio) { | |
| 285 if (edges_debug) { | |
| 286 tprintf( | |
| 287 "Discarding parent of area %d, child area=%d, max%g " | |
| 288 "with child length=%d\n", | |
| 289 parent_area, child_area, max_parent_area, child_length); | |
| 290 } | |
| 291 return max_count + 1; | |
| 292 } | |
| 293 } | |
| 294 if (child_area < child->bounding_box().area() * edges_childarea) { | |
| 295 if (edges_debug) { | |
| 296 tprintf( | |
| 297 "Discarding parent of area %d, child area=%d, max%g " | |
| 298 "with child rect=%d\n", | |
| 299 parent_area, child_area, max_parent_area, | |
| 300 child->bounding_box().area()); | |
| 301 } | |
| 302 return max_count + 1; | |
| 303 } | |
| 304 } | |
| 305 } | |
| 306 } | |
| 307 } | |
| 308 } | |
| 309 return child_count + grandchild_count; | |
| 310 } | |
| 311 | |
| 312 /** | |
| 313 * @name OL_BUCKETS::extract_children | |
| 314 * | |
| 315 * Find number of descendants of this outline. | |
| 316 */ | |
| 317 | |
| 318 void OL_BUCKETS::extract_children( // recursive count | |
| 319 C_OUTLINE *outline, // parent outline | |
| 320 C_OUTLINE_IT *it // destination iterator | |
| 321 ) { | |
| 322 TDimension xmin, xmax; // coord limits | |
| 323 TDimension ymin, ymax; | |
| 324 TBOX olbox; | |
| 325 C_OUTLINE_IT child_it; // search iterator | |
| 326 | |
| 327 olbox = outline->bounding_box(); | |
| 328 xmin = (olbox.left() - bl.x()) / BUCKETSIZE; | |
| 329 xmax = (olbox.right() - bl.x()) / BUCKETSIZE; | |
| 330 ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE; | |
| 331 ymax = (olbox.top() - bl.y()) / BUCKETSIZE; | |
| 332 for (auto yindex = ymin; yindex <= ymax; yindex++) { | |
| 333 for (auto xindex = xmin; xindex <= xmax; xindex++) { | |
| 334 child_it.set_to_list(&buckets[yindex * bxdim + xindex]); | |
| 335 for (child_it.mark_cycle_pt(); !child_it.cycled_list(); | |
| 336 child_it.forward()) { | |
| 337 if (*child_it.data() < *outline) { | |
| 338 it->add_after_then_move(child_it.extract()); | |
| 339 } | |
| 340 } | |
| 341 } | |
| 342 } | |
| 343 } | |
| 344 | |
| 345 /// @name extract_edges | |
| 346 | |
| 347 void extract_edges(Image pix, // thresholded image | |
| 348 BLOCK *block) { // block to scan | |
| 349 C_OUTLINE_LIST outlines; // outlines in block | |
| 350 C_OUTLINE_IT out_it = &outlines; | |
| 351 | |
| 352 block_edges(pix, &(block->pdblk), &out_it); | |
| 353 ICOORD bleft; // block box | |
| 354 ICOORD tright; | |
| 355 block->pdblk.bounding_box(bleft, tright); | |
| 356 // make blobs | |
| 357 outlines_to_blobs(block, bleft, tright, &outlines); | |
| 358 } | |
| 359 | |
| 360 /// @name fill_buckets | |
| 361 | |
| 362 static void fill_buckets(C_OUTLINE_LIST *outlines, // outlines in block | |
| 363 OL_BUCKETS *buckets // output buckets | |
| 364 ) { | |
| 365 C_OUTLINE_IT out_it = outlines; // iterator | |
| 366 C_OUTLINE_IT bucket_it; // iterator in bucket | |
| 367 | |
| 368 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { | |
| 369 auto outline = out_it.extract(); // take off list | |
| 370 // get box | |
| 371 const TBOX &ol_box(outline->bounding_box()); | |
| 372 bucket_it.set_to_list((*buckets)(ol_box.left(), ol_box.bottom())); | |
| 373 bucket_it.add_to_end(outline); | |
| 374 } | |
| 375 } | |
| 376 | |
| 377 /** | |
| 378 * @name capture_children | |
| 379 * | |
| 380 * Find all neighbouring outlines that are children of this outline | |
| 381 * and either move them to the output list or declare this outline | |
| 382 * illegal and return false. | |
| 383 */ | |
| 384 | |
| 385 static bool capture_children(OL_BUCKETS *buckets, // bucket sort class | |
| 386 C_BLOB_IT *reject_it, // dead grandchildren | |
| 387 C_OUTLINE_IT *blob_it // output outlines | |
| 388 ) { | |
| 389 // master outline | |
| 390 auto outline = blob_it->data(); | |
| 391 // no of children | |
| 392 int32_t child_count; | |
| 393 if (edges_use_new_outline_complexity) { | |
| 394 child_count = | |
| 395 buckets->outline_complexity(outline, edges_children_count_limit, 0); | |
| 396 } else { | |
| 397 child_count = buckets->count_children(outline, edges_children_count_limit); | |
| 398 } | |
| 399 if (child_count > edges_children_count_limit) { | |
| 400 return false; | |
| 401 } | |
| 402 | |
| 403 if (child_count > 0) { | |
| 404 buckets->extract_children(outline, blob_it); | |
| 405 } | |
| 406 return true; | |
| 407 } | |
| 408 | |
| 409 /** | |
| 410 * @name empty_buckets | |
| 411 * | |
| 412 * Run the edge detector over the block and return a list of blobs. | |
| 413 */ | |
| 414 | |
| 415 static void empty_buckets(BLOCK *block, // block to scan | |
| 416 OL_BUCKETS *buckets // output buckets | |
| 417 ) { | |
| 418 C_OUTLINE_LIST outlines; // outlines in block | |
| 419 // iterator | |
| 420 C_OUTLINE_IT out_it = &outlines; | |
| 421 auto start_scan = buckets->start_scan(); | |
| 422 if (start_scan == nullptr) { | |
| 423 return; | |
| 424 } | |
| 425 C_OUTLINE_IT bucket_it = start_scan; | |
| 426 C_BLOB_IT good_blobs = block->blob_list(); | |
| 427 C_BLOB_IT junk_blobs = block->reject_blobs(); | |
| 428 | |
| 429 while (!bucket_it.empty()) { | |
| 430 out_it.set_to_list(&outlines); | |
| 431 C_OUTLINE_IT parent_it; // parent outline | |
| 432 do { | |
| 433 parent_it = bucket_it; // find outermost | |
| 434 do { | |
| 435 bucket_it.forward(); | |
| 436 } while (!bucket_it.at_first() && | |
| 437 !(*parent_it.data() < *bucket_it.data())); | |
| 438 } while (!bucket_it.at_first()); | |
| 439 | |
| 440 // move to new list | |
| 441 out_it.add_after_then_move(parent_it.extract()); | |
| 442 // healthy blob | |
| 443 bool good_blob = capture_children(buckets, &junk_blobs, &out_it); | |
| 444 C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs, | |
| 445 &junk_blobs); | |
| 446 | |
| 447 if (auto l = buckets->scan_next()) | |
| 448 bucket_it.set_to_list(l); | |
| 449 else | |
| 450 break; | |
| 451 } | |
| 452 } | |
| 453 | |
| 454 /** | |
| 455 * @name outlines_to_blobs | |
| 456 * | |
| 457 * Gather together outlines into blobs using the usual bucket sort. | |
| 458 */ | |
| 459 | |
| 460 void outlines_to_blobs( // find blobs | |
| 461 BLOCK *block, // block to scan | |
| 462 ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines) { | |
| 463 // make buckets | |
| 464 OL_BUCKETS buckets(bleft, tright); | |
| 465 | |
| 466 fill_buckets(outlines, &buckets); | |
| 467 empty_buckets(block, &buckets); | |
| 468 } | |
| 469 | |
| 470 } // namespace tesseract |
