Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/ccnontextdetect.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: ccnontextdetect.cpp | |
| 3 // Description: Connected-Component-based photo (non-text) detection. | |
| 4 // Author: rays@google.com (Ray Smith) | |
| 5 // | |
| 6 // Copyright 2011 Google Inc. All Rights Reserved. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifdef HAVE_CONFIG_H | |
| 20 # include "config_auto.h" | |
| 21 #endif | |
| 22 | |
| 23 #include "ccnontextdetect.h" | |
| 24 #include "helpers.h" // for IntCastRounded | |
| 25 #include "imagefind.h" | |
| 26 #include "strokewidth.h" | |
| 27 | |
| 28 namespace tesseract { | |
| 29 | |
// Upper limit on the number of neighbouring small objects per squared
// gridsize; a grid cell that exceeds it becomes image.
constexpr double kMaxSmallNeighboursPerPix = 1.0 / 32;

// Upper limit on the number of small blobs a large blob may overlap; beyond
// this the large blob is rejected and determined to be image.
constexpr int kMaxLargeOverlapsWithSmall = 3;

// Upper limit on the number of small blobs a medium blob may overlap; beyond
// this the medium blob is rejected and determined to be image. The limit is
// larger than the one for large blobs because medium blobs may be complex
// Chinese characters. Very large Chinese characters are going to overlap
// more medium blobs than small.
constexpr int kMaxMediumOverlapsWithSmall = 12;

// Upper limit on the number of normal blobs a large blob may overlap; beyond
// this the large blob is rejected and determined to be image. Set high to
// allow for drop caps, which may overlap a lot of good text blobs.
constexpr int kMaxLargeOverlapsWithMedium = 12;

// Multiplier applied to the original (un-summed) noise count of a cell when
// testing whether noise has been spread beyond where it should really be.
constexpr int kOriginalNoiseMultiple = 8;

// Padding, in pixels, for noise blobs rendered on the image mask, to
// encourage them to join together. Too large a value fattens images out too
// much, and they then have to be clipped to text.
constexpr int kNoisePadding = 4;

// Fraction of max_noise_count_ that is added to the noise count of a cell
// when there is photo mask in the background.
constexpr double kPhotoOffsetFraction = 0.375;

// Minimum ratio of perimeter^2/(16*area) for a "good" blob when estimating
// noise density. Good blobs are supposed to be highly likely real text.
// A square is taken to have unit ratio, where A=(p/4)^2, hence the factor
// of 16. Digital circles are weird and have a minimum ratio of pi/64, not
// the 1/(4pi) that you would expect.
constexpr double kMinGoodTextPARatio = 1.5;
| 61 | |
| 62 CCNonTextDetect::CCNonTextDetect(int gridsize, const ICOORD &bleft, const ICOORD &tright) | |
| 63 : BlobGrid(gridsize, bleft, tright) | |
| 64 , max_noise_count_(static_cast<int>(kMaxSmallNeighboursPerPix * gridsize * gridsize)) | |
| 65 , noise_density_(nullptr) { | |
| 66 // TODO(rays) break max_noise_count_ out into an area-proportional | |
| 67 // value, as now plus an additive constant for the number of text blobs | |
| 68 // in the 3x3 neighbourhood - maybe 9. | |
| 69 } | |
| 70 | |
| 71 CCNonTextDetect::~CCNonTextDetect() { | |
| 72 delete noise_density_; | |
| 73 } | |
| 74 | |
| 75 // Creates and returns a Pix with the same resolution as the original | |
| 76 // in which 1 (black) pixels represent likely non text (photo, line drawing) | |
| 77 // areas of the page, deleting from the blob_block the blobs that were | |
| 78 // determined to be non-text. | |
| 79 // The photo_map is used to bias the decision towards non-text, rather than | |
| 80 // supplying definite decision. | |
| 81 // The blob_block is the usual result of connected component analysis, | |
| 82 // holding the detected blobs. | |
| 83 // The returned Pix should be PixDestroyed after use. | |
| 84 Image CCNonTextDetect::ComputeNonTextMask(bool debug, Image photo_map, TO_BLOCK *blob_block) { | |
| 85 // Insert the smallest blobs into the grid. | |
| 86 InsertBlobList(&blob_block->small_blobs); | |
| 87 InsertBlobList(&blob_block->noise_blobs); | |
| 88 // Add the medium blobs that don't have a good strokewidth neighbour. | |
| 89 // Those that do go into good_grid as an antidote to spreading beyond the | |
| 90 // real reaches of a noise region. | |
| 91 BlobGrid good_grid(gridsize(), bleft(), tright()); | |
| 92 BLOBNBOX_IT blob_it(&blob_block->blobs); | |
| 93 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 94 BLOBNBOX *blob = blob_it.data(); | |
| 95 double perimeter_area_ratio = blob->cblob()->perimeter() / 4.0; | |
| 96 perimeter_area_ratio *= perimeter_area_ratio / blob->enclosed_area(); | |
| 97 if (blob->GoodTextBlob() == 0 || perimeter_area_ratio < kMinGoodTextPARatio) { | |
| 98 InsertBBox(true, true, blob); | |
| 99 } else { | |
| 100 good_grid.InsertBBox(true, true, blob); | |
| 101 } | |
| 102 } | |
| 103 noise_density_ = ComputeNoiseDensity(debug, photo_map, &good_grid); | |
| 104 good_grid.Clear(); // Not needed any more. | |
| 105 Image pix = noise_density_->ThresholdToPix(max_noise_count_); | |
| 106 if (debug) { | |
| 107 pixWrite("junknoisemask.png", pix, IFF_PNG); | |
| 108 } | |
| 109 ScrollView *win = nullptr; | |
| 110 #ifndef GRAPHICS_DISABLED | |
| 111 if (debug) { | |
| 112 win = MakeWindow(0, 400, "Photo Mask Blobs"); | |
| 113 } | |
| 114 #endif // !GRAPHICS_DISABLED | |
| 115 // Large and medium blobs are not text if they overlap with "a lot" of small | |
| 116 // blobs. | |
| 117 MarkAndDeleteNonTextBlobs(&blob_block->large_blobs, kMaxLargeOverlapsWithSmall, win, | |
| 118 ScrollView::DARK_GREEN, pix); | |
| 119 MarkAndDeleteNonTextBlobs(&blob_block->blobs, kMaxMediumOverlapsWithSmall, win, ScrollView::WHITE, | |
| 120 pix); | |
| 121 // Clear the grid of small blobs and insert the medium blobs. | |
| 122 Clear(); | |
| 123 InsertBlobList(&blob_block->blobs); | |
| 124 MarkAndDeleteNonTextBlobs(&blob_block->large_blobs, kMaxLargeOverlapsWithMedium, win, | |
| 125 ScrollView::DARK_GREEN, pix); | |
| 126 // Clear again before we start deleting the blobs in the grid. | |
| 127 Clear(); | |
| 128 MarkAndDeleteNonTextBlobs(&blob_block->noise_blobs, -1, win, ScrollView::CORAL, pix); | |
| 129 MarkAndDeleteNonTextBlobs(&blob_block->small_blobs, -1, win, ScrollView::GOLDENROD, pix); | |
| 130 MarkAndDeleteNonTextBlobs(&blob_block->blobs, -1, win, ScrollView::WHITE, pix); | |
| 131 if (debug) { | |
| 132 #ifndef GRAPHICS_DISABLED | |
| 133 win->Update(); | |
| 134 #endif // !GRAPHICS_DISABLED | |
| 135 pixWrite("junkccphotomask.png", pix, IFF_PNG); | |
| 136 #ifndef GRAPHICS_DISABLED | |
| 137 win->AwaitEvent(SVET_DESTROY); | |
| 138 delete win; | |
| 139 #endif // !GRAPHICS_DISABLED | |
| 140 } | |
| 141 return pix; | |
| 142 } | |
| 143 | |
| 144 // Computes and returns the noise_density IntGrid, at the same gridsize as | |
| 145 // this by summing the number of small elements in a 3x3 neighbourhood of | |
| 146 // each grid cell. good_grid is filled with blobs that are considered most | |
| 147 // likely good text, and this is filled with small and medium blobs that are | |
| 148 // more likely non-text. | |
| 149 // The photo_map is used to bias the decision towards non-text, rather than | |
| 150 // supplying definite decision. | |
| 151 IntGrid *CCNonTextDetect::ComputeNoiseDensity(bool debug, Image photo_map, BlobGrid *good_grid) { | |
| 152 IntGrid *noise_counts = CountCellElements(); | |
| 153 IntGrid *noise_density = noise_counts->NeighbourhoodSum(); | |
| 154 IntGrid *good_counts = good_grid->CountCellElements(); | |
| 155 // Now increase noise density in photo areas, to bias the decision and | |
| 156 // minimize hallucinated text on image, but trim the noise_density where | |
| 157 // there are good blobs and the original count is low in non-photo areas, | |
| 158 // indicating that most of the result came from neighbouring cells. | |
| 159 int height = pixGetHeight(photo_map); | |
| 160 int photo_offset = IntCastRounded(max_noise_count_ * kPhotoOffsetFraction); | |
| 161 for (int y = 0; y < gridheight(); ++y) { | |
| 162 for (int x = 0; x < gridwidth(); ++x) { | |
| 163 int noise = noise_density->GridCellValue(x, y); | |
| 164 if (max_noise_count_ < noise + photo_offset && noise <= max_noise_count_) { | |
| 165 // Test for photo. | |
| 166 int left = x * gridsize(); | |
| 167 int right = left + gridsize(); | |
| 168 int bottom = height - y * gridsize(); | |
| 169 int top = bottom - gridsize(); | |
| 170 if (ImageFind::BoundsWithinRect(photo_map, &left, &top, &right, &bottom)) { | |
| 171 noise_density->SetGridCell(x, y, noise + photo_offset); | |
| 172 } | |
| 173 } | |
| 174 if (debug && noise > max_noise_count_ && good_counts->GridCellValue(x, y) > 0) { | |
| 175 tprintf("At %d, %d, noise = %d, good=%d, orig=%d, thr=%d\n", x * gridsize(), y * gridsize(), | |
| 176 noise_density->GridCellValue(x, y), good_counts->GridCellValue(x, y), | |
| 177 noise_counts->GridCellValue(x, y), max_noise_count_); | |
| 178 } | |
| 179 if (noise > max_noise_count_ && good_counts->GridCellValue(x, y) > 0 && | |
| 180 noise_counts->GridCellValue(x, y) * kOriginalNoiseMultiple <= max_noise_count_) { | |
| 181 noise_density->SetGridCell(x, y, 0); | |
| 182 } | |
| 183 } | |
| 184 } | |
| 185 delete noise_counts; | |
| 186 delete good_counts; | |
| 187 return noise_density; | |
| 188 } | |
| 189 | |
| 190 // Helper to expand a box in one of the 4 directions by the given pad, | |
| 191 // provided it does not expand into any cell with a zero noise density. | |
| 192 // If that is not possible, try expanding all round by a small constant. | |
| 193 static TBOX AttemptBoxExpansion(const TBOX &box, const IntGrid &noise_density, int pad) { | |
| 194 TBOX expanded_box(box); | |
| 195 expanded_box.set_right(box.right() + pad); | |
| 196 if (!noise_density.AnyZeroInRect(expanded_box)) { | |
| 197 return expanded_box; | |
| 198 } | |
| 199 expanded_box = box; | |
| 200 expanded_box.set_left(box.left() - pad); | |
| 201 if (!noise_density.AnyZeroInRect(expanded_box)) { | |
| 202 return expanded_box; | |
| 203 } | |
| 204 expanded_box = box; | |
| 205 expanded_box.set_top(box.top() + pad); | |
| 206 if (!noise_density.AnyZeroInRect(expanded_box)) { | |
| 207 return expanded_box; | |
| 208 } | |
| 209 expanded_box = box; | |
| 210 expanded_box.set_bottom(box.bottom() + pad); | |
| 211 if (!noise_density.AnyZeroInRect(expanded_box)) { | |
| 212 return expanded_box; | |
| 213 } | |
| 214 expanded_box = box; | |
| 215 expanded_box.pad(kNoisePadding, kNoisePadding); | |
| 216 if (!noise_density.AnyZeroInRect(expanded_box)) { | |
| 217 return expanded_box; | |
| 218 } | |
| 219 return box; | |
| 220 } | |
| 221 | |
| 222 // Tests each blob in the list to see if it is certain non-text using 2 | |
| 223 // conditions: | |
| 224 // 1. blob overlaps a cell with high value in noise_density_ (previously set | |
| 225 // by ComputeNoiseDensity). | |
| 226 // OR 2. The blob overlaps more than max_blob_overlaps in *this grid. This | |
| 227 // condition is disabled with max_blob_overlaps == -1. | |
| 228 // If it does, the blob is declared non-text, and is used to mark up the | |
| 229 // nontext_mask. Such blobs are fully deleted, and non-noise blobs have their | |
| 230 // neighbours reset, as they may now point to deleted data. | |
| 231 // WARNING: The blobs list blobs may be in the *this grid, but they are | |
| 232 // not removed. If any deleted blobs might be in *this, then this must be | |
| 233 // Clear()ed immediately after MarkAndDeleteNonTextBlobs is called. | |
| 234 // If the win is not nullptr, deleted blobs are drawn on it in red, and kept | |
| 235 // blobs are drawn on it in ok_color. | |
| 236 void CCNonTextDetect::MarkAndDeleteNonTextBlobs(BLOBNBOX_LIST *blobs, int max_blob_overlaps, | |
| 237 ScrollView *win, ScrollView::Color ok_color, | |
| 238 Image nontext_mask) { | |
| 239 int imageheight = tright().y() - bleft().x(); | |
| 240 BLOBNBOX_IT blob_it(blobs); | |
| 241 BLOBNBOX_LIST dead_blobs; | |
| 242 BLOBNBOX_IT dead_it(&dead_blobs); | |
| 243 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 244 BLOBNBOX *blob = blob_it.data(); | |
| 245 TBOX box = blob->bounding_box(); | |
| 246 if (!noise_density_->RectMostlyOverThreshold(box, max_noise_count_) && | |
| 247 (max_blob_overlaps < 0 || !BlobOverlapsTooMuch(blob, max_blob_overlaps))) { | |
| 248 blob->ClearNeighbours(); | |
| 249 #ifndef GRAPHICS_DISABLED | |
| 250 if (win != nullptr) { | |
| 251 blob->plot(win, ok_color, ok_color); | |
| 252 } | |
| 253 #endif // !GRAPHICS_DISABLED | |
| 254 } else { | |
| 255 if (noise_density_->AnyZeroInRect(box)) { | |
| 256 // There is a danger that the bounding box may overlap real text, so | |
| 257 // we need to render the outline. | |
| 258 Image blob_pix = blob->cblob()->render_outline(); | |
| 259 pixRasterop(nontext_mask, box.left(), imageheight - box.top(), box.width(), box.height(), | |
| 260 PIX_SRC | PIX_DST, blob_pix, 0, 0); | |
| 261 blob_pix.destroy(); | |
| 262 } else { | |
| 263 if (box.area() < gridsize() * gridsize()) { | |
| 264 // It is a really bad idea to make lots of small components in the | |
| 265 // photo mask, so try to join it to a bigger area by expanding the | |
| 266 // box in a way that does not touch any zero noise density cell. | |
| 267 box = AttemptBoxExpansion(box, *noise_density_, gridsize()); | |
| 268 } | |
| 269 // All overlapped cells are non-zero, so just mark the rectangle. | |
| 270 pixRasterop(nontext_mask, box.left(), imageheight - box.top(), box.width(), box.height(), | |
| 271 PIX_SET, nullptr, 0, 0); | |
| 272 } | |
| 273 #ifndef GRAPHICS_DISABLED | |
| 274 if (win != nullptr) { | |
| 275 blob->plot(win, ScrollView::RED, ScrollView::RED); | |
| 276 } | |
| 277 #endif // !GRAPHICS_DISABLED | |
| 278 // It is safe to delete the cblob now, as it isn't used by the grid | |
| 279 // or BlobOverlapsTooMuch, and the BLOBNBOXes will go away with the | |
| 280 // dead_blobs list. | |
| 281 // TODO: remove next line, currently still needed for resultiterator_test. | |
| 282 delete blob->remove_cblob(); | |
| 283 dead_it.add_to_end(blob_it.extract()); | |
| 284 } | |
| 285 } | |
| 286 } | |
| 287 | |
| 288 // Returns true if the given blob overlaps more than max_overlaps blobs | |
| 289 // in the current grid. | |
| 290 bool CCNonTextDetect::BlobOverlapsTooMuch(BLOBNBOX *blob, int max_overlaps) { | |
| 291 // Search the grid to see what intersects it. | |
| 292 // Setup a Rectangle search for overlapping this blob. | |
| 293 BlobGridSearch rsearch(this); | |
| 294 const TBOX &box = blob->bounding_box(); | |
| 295 rsearch.StartRectSearch(box); | |
| 296 rsearch.SetUniqueMode(true); | |
| 297 BLOBNBOX *neighbour; | |
| 298 int overlap_count = 0; | |
| 299 while (overlap_count <= max_overlaps && (neighbour = rsearch.NextRectSearch()) != nullptr) { | |
| 300 if (box.major_overlap(neighbour->bounding_box())) { | |
| 301 ++overlap_count; | |
| 302 if (overlap_count > max_overlaps) { | |
| 303 return true; | |
| 304 } | |
| 305 } | |
| 306 } | |
| 307 return false; | |
| 308 } | |
| 309 | |
| 310 } // namespace tesseract. |
