diff mupdf-source/thirdparty/tesseract/src/textord/ccnontextdetect.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/textord/ccnontextdetect.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,310 @@
+///////////////////////////////////////////////////////////////////////
+// File:        ccnontextdetect.cpp
+// Description: Connected-Component-based photo (non-text) detection.
+// Author:      rays@google.com (Ray Smith)
+//
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "ccnontextdetect.h"
+#include "helpers.h"         // for IntCastRounded
+#include "imagefind.h"
+#include "strokewidth.h"
+
+namespace tesseract {
+
+// Upper bound on neighbouring small objects per squared gridsize; a grid
+// cell above this bound is treated as image.
+constexpr double kMaxSmallNeighboursPerPix = 1.0 / 32;
+// A large blob overlapping more small blobs than this is rejected and
+// classed as image.
+constexpr int kMaxLargeOverlapsWithSmall = 3;
+// A medium blob overlapping more small blobs than this is rejected and
+// classed as image. The limit is higher than for large blobs because medium
+// blobs may be complex Chinese characters. Very large Chinese characters are
+// going to overlap more medium blobs than small.
+constexpr int kMaxMediumOverlapsWithSmall = 12;
+// A large blob overlapping more normal blobs than this is rejected and
+// classed as image. Set high to tolerate drop caps, which may overlap a lot
+// of good text blobs.
+constexpr int kMaxLargeOverlapsWithMedium = 12;
+// Factor applied to the original (un-summed) noise count when testing
+// whether noise has been spread beyond where it should really be.
+constexpr int kOriginalNoiseMultiple = 8;
+// Padding, in pixels, applied to noise blobs rendered on the image mask so
+// that they tend to merge. Too large a value fattens images so much that
+// they have to be clipped to text.
+constexpr int kNoisePadding = 4;
+// Share of max_noise_count_ that is added to a cell's noise count when the
+// photo mask covers the cell's background.
+constexpr double kPhotoOffsetFraction = 0.375;
+// Lower bound on perimeter^2/(16*area) for a blob to count as "good" when
+// estimating noise density. Good blobs are supposed to be highly likely
+// real text. A square is taken as the unit ratio, A=(p/4)^2, which is where
+// the factor of 16 comes from. Original author's note: digital circles are
+// weird and have a minimum ratio of pi/64, not the 1/(4pi) that you would
+// expect.
+constexpr double kMinGoodTextPARatio = 1.5;
+
+// Sets up the underlying BlobGrid with the given geometry and derives the
+// per-cell noise threshold from the cell area (gridsize squared). The noise
+// density grid starts out absent; ComputeNonTextMask fills it in.
+CCNonTextDetect::CCNonTextDetect(int gridsize, const ICOORD &bleft, const ICOORD &tright)
+    : BlobGrid(gridsize, bleft, tright),
+      max_noise_count_(static_cast<int>(kMaxSmallNeighboursPerPix * gridsize * gridsize)),
+      noise_density_(nullptr) {
+  // TODO(rays) split max_noise_count_ into a part proportional to area (as it
+  // is now) plus an additive constant for the number of text blobs in the
+  // 3x3 neighbourhood - maybe 9.
+}
+
+// Frees the noise density grid owned by this detector.
+CCNonTextDetect::~CCNonTextDetect() {
+  // Still nullptr if no mask was ever computed; deleting nullptr is a no-op.
+  delete noise_density_;
+}
+
+// Creates and returns an Image with the same resolution as the original
+// in which 1 (black) pixels represent likely non text (photo, line drawing)
+// areas of the page, deleting from the blob_block the blobs that were
+// determined to be non-text.
+// debug: when true, the intermediate and final masks are written to
+//   "junknoisemask.png" and "junkccphotomask.png", and (unless graphics are
+//   compiled out) the kept/deleted blobs are shown in a window that blocks
+//   until it is destroyed.
+// photo_map: used to bias the decision towards non-text, rather than
+//   supplying definite decision.
+// blob_block: the usual result of connected component analysis, holding the
+//   detected blobs. Its noise, small, medium and large blob lists are
+//   filtered in place.
+// The caller owns the returned Image and should destroy it after use.
+// The noise density grid computed here is kept in noise_density_ until the
+// next call or until this object is destroyed.
+Image CCNonTextDetect::ComputeNonTextMask(bool debug, Image photo_map, TO_BLOCK *blob_block) {
+  // Insert the smallest blobs into the grid.
+  InsertBlobList(&blob_block->small_blobs);
+  InsertBlobList(&blob_block->noise_blobs);
+  // Add the medium blobs that don't have a good strokewidth neighbour.
+  // Those that do go into good_grid as an antidote to spreading beyond the
+  // real reaches of a noise region.
+  BlobGrid good_grid(gridsize(), bleft(), tright());
+  BLOBNBOX_IT blob_it(&blob_block->blobs);
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    BLOBNBOX *blob = blob_it.data();
+    // perimeter^2 / (16 * area): 1.0 for a square, see kMinGoodTextPARatio.
+    double perimeter_area_ratio = blob->cblob()->perimeter() / 4.0;
+    perimeter_area_ratio *= perimeter_area_ratio / blob->enclosed_area();
+    if (blob->GoodTextBlob() == 0 || perimeter_area_ratio < kMinGoodTextPARatio) {
+      InsertBBox(true, true, blob);
+    } else {
+      good_grid.InsertBBox(true, true, blob);
+    }
+  }
+  // Build the new density grid first, then drop whatever grid an earlier
+  // call left behind, so that calling this function more than once on the
+  // same detector does not leak the previous grid.
+  IntGrid *fresh_density = ComputeNoiseDensity(debug, photo_map, &good_grid);
+  delete noise_density_;
+  noise_density_ = fresh_density;
+  good_grid.Clear(); // Not needed any more.
+  Image pix = noise_density_->ThresholdToPix(max_noise_count_);
+  if (debug) {
+    pixWrite("junknoisemask.png", pix, IFF_PNG);
+  }
+  ScrollView *win = nullptr;
+#ifndef GRAPHICS_DISABLED
+  if (debug) {
+    win = MakeWindow(0, 400, "Photo Mask Blobs");
+  }
+#endif // !GRAPHICS_DISABLED
+  // Large and medium blobs are not text if they overlap with "a lot" of small
+  // blobs.
+  MarkAndDeleteNonTextBlobs(&blob_block->large_blobs, kMaxLargeOverlapsWithSmall, win,
+                            ScrollView::DARK_GREEN, pix);
+  MarkAndDeleteNonTextBlobs(&blob_block->blobs, kMaxMediumOverlapsWithSmall, win, ScrollView::WHITE,
+                            pix);
+  // Clear the grid of small blobs and insert the medium blobs.
+  Clear();
+  InsertBlobList(&blob_block->blobs);
+  MarkAndDeleteNonTextBlobs(&blob_block->large_blobs, kMaxLargeOverlapsWithMedium, win,
+                            ScrollView::DARK_GREEN, pix);
+  // Clear again before we start deleting the blobs in the grid.
+  Clear();
+  // With the overlap test disabled (-1) only the noise density decides.
+  MarkAndDeleteNonTextBlobs(&blob_block->noise_blobs, -1, win, ScrollView::CORAL, pix);
+  MarkAndDeleteNonTextBlobs(&blob_block->small_blobs, -1, win, ScrollView::GOLDENROD, pix);
+  MarkAndDeleteNonTextBlobs(&blob_block->blobs, -1, win, ScrollView::WHITE, pix);
+  if (debug) {
+#ifndef GRAPHICS_DISABLED
+    win->Update();
+#endif // !GRAPHICS_DISABLED
+    pixWrite("junkccphotomask.png", pix, IFF_PNG);
+#ifndef GRAPHICS_DISABLED
+    // Blocks until the debug window is closed.
+    win->AwaitEvent(SVET_DESTROY);
+    delete win;
+#endif // !GRAPHICS_DISABLED
+  }
+  return pix;
+}
+
+// Builds the noise_density IntGrid, at the same gridsize as this grid, by
+// summing the small-element counts over the 3x3 neighbourhood of every grid
+// cell, and returns it. The caller takes ownership of the returned grid.
+// On entry good_grid holds the blobs considered most likely to be good
+// text, while this grid holds the small and medium blobs that are more
+// likely non-text.
+// The photo_map only biases the decision towards non-text; it does not
+// supply a definite decision.
+IntGrid *CCNonTextDetect::ComputeNoiseDensity(bool debug, Image photo_map, BlobGrid *good_grid) {
+  IntGrid *noise_counts = CountCellElements();
+  IntGrid *noise_density = noise_counts->NeighbourhoodSum();
+  IntGrid *good_counts = good_grid->CountCellElements();
+  // Two adjustments are made per cell. Cells that are just under the
+  // threshold get boosted when they lie on photo, to minimize hallucinated
+  // text on image. Cells over the threshold are zeroed when they contain
+  // good blobs and their own original count is low, which indicates that
+  // most of the density came from neighbouring cells.
+  const int photo_height = pixGetHeight(photo_map);
+  const int photo_offset = IntCastRounded(max_noise_count_ * kPhotoOffsetFraction);
+  const int cell = gridsize();
+  for (int row = 0; row < gridheight(); ++row) {
+    for (int col = 0; col < gridwidth(); ++col) {
+      const int noise = noise_density->GridCellValue(col, row);
+      if (noise <= max_noise_count_) {
+        // Not over the threshold yet: only the photo boost can apply, and
+        // only if the boost would push the cell over the threshold.
+        if (max_noise_count_ < noise + photo_offset) {
+          // Cell rectangle in image coordinates (y flipped).
+          int left = col * cell;
+          int right = left + cell;
+          int bottom = photo_height - row * cell;
+          int top = bottom - cell;
+          if (ImageFind::BoundsWithinRect(photo_map, &left, &top, &right, &bottom)) {
+            noise_density->SetGridCell(col, row, noise + photo_offset);
+          }
+        }
+        continue;
+      }
+      // Over the threshold: candidate for trimming if good text is present.
+      const int good = good_counts->GridCellValue(col, row);
+      if (good <= 0) {
+        continue;
+      }
+      if (debug) {
+        tprintf("At %d, %d, noise = %d, good=%d, orig=%d, thr=%d\n", col * cell, row * cell,
+                noise_density->GridCellValue(col, row), good,
+                noise_counts->GridCellValue(col, row), max_noise_count_);
+      }
+      if (noise_counts->GridCellValue(col, row) * kOriginalNoiseMultiple <= max_noise_count_) {
+        noise_density->SetGridCell(col, row, 0);
+      }
+    }
+  }
+  delete noise_counts;
+  delete good_counts;
+  return noise_density;
+}
+
+// Helper to expand a box in one of the 4 directions by the given pad,
+// provided it does not expand into any cell with a zero noise density.
+// The directions are tried in the order right, left, up, down and the first
+// expansion that touches no zero-density cell is returned.
+// If none of them is possible, try expanding all round by the small
+// constant kNoisePadding. If that fails too, the box is returned unchanged.
+static TBOX AttemptBoxExpansion(const TBOX &box, const IntGrid &noise_density, int pad) {
+  // Rightwards.
+  TBOX expanded_box(box);
+  expanded_box.set_right(box.right() + pad);
+  if (!noise_density.AnyZeroInRect(expanded_box)) {
+    return expanded_box;
+  }
+  // Leftwards.
+  expanded_box = box;
+  expanded_box.set_left(box.left() - pad);
+  if (!noise_density.AnyZeroInRect(expanded_box)) {
+    return expanded_box;
+  }
+  // Upwards.
+  expanded_box = box;
+  expanded_box.set_top(box.top() + pad);
+  if (!noise_density.AnyZeroInRect(expanded_box)) {
+    return expanded_box;
+  }
+  // Downwards. The top edge grows with + pad, so the bottom edge has to move
+  // by - pad to enlarge the box; adding the pad would shrink (or invert) it.
+  expanded_box = box;
+  expanded_box.set_bottom(box.bottom() - pad);
+  if (!noise_density.AnyZeroInRect(expanded_box)) {
+    return expanded_box;
+  }
+  // Last resort: a small uniform pad on all sides.
+  expanded_box = box;
+  expanded_box.pad(kNoisePadding, kNoisePadding);
+  if (!noise_density.AnyZeroInRect(expanded_box)) {
+    return expanded_box;
+  }
+  return box;
+}
+
+// Tests each blob in the list to see if it is certain non-text using 2
+// conditions:
+// 1. blob overlaps a cell with high value in noise_density_ (previously set
+// by ComputeNoiseDensity).
+// OR 2. The blob overlaps more than max_blob_overlaps in *this grid. This
+// condition is disabled with max_blob_overlaps == -1 (any negative value).
+// If it does, the blob is declared non-text, and is used to mark up the
+// nontext_mask. Such blobs are fully deleted. Blobs that are kept have their
+// neighbours reset, as they may now point to deleted data.
+// WARNING: The blobs list blobs may be in the *this grid, but they are
+// not removed. If any deleted blobs might be in *this, then this must be
+// Clear()ed immediately after MarkAndDeleteNonTextBlobs is called.
+// If the win is not nullptr, deleted blobs are drawn on it in red, and kept
+// blobs are drawn on it in ok_color.
+void CCNonTextDetect::MarkAndDeleteNonTextBlobs(BLOBNBOX_LIST *blobs, int max_blob_overlaps,
+                                                ScrollView *win, ScrollView::Color ok_color,
+                                                Image nontext_mask) {
+  // Vertical extent of the grid, used to flip blob y coordinates into image
+  // rows. Both terms must be y coordinates; the earlier bleft().x() only
+  // gave the right answer when the grid's bottom-left x and y were equal.
+  int imageheight = tright().y() - bleft().y();
+  BLOBNBOX_IT blob_it(blobs);
+  // Rejected blobs are moved here and go away when this list is destroyed
+  // at the end of the function.
+  BLOBNBOX_LIST dead_blobs;
+  BLOBNBOX_IT dead_it(&dead_blobs);
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    BLOBNBOX *blob = blob_it.data();
+    TBOX box = blob->bounding_box();
+    if (!noise_density_->RectMostlyOverThreshold(box, max_noise_count_) &&
+        (max_blob_overlaps < 0 || !BlobOverlapsTooMuch(blob, max_blob_overlaps))) {
+      // Blob is kept.
+      blob->ClearNeighbours();
+#ifndef GRAPHICS_DISABLED
+      if (win != nullptr) {
+        blob->plot(win, ok_color, ok_color);
+      }
+#endif // !GRAPHICS_DISABLED
+    } else {
+      // Blob is non-text: paint it into the mask, then remove it.
+      if (noise_density_->AnyZeroInRect(box)) {
+        // There is a danger that the bounding box may overlap real text, so
+        // we need to render the outline.
+        Image blob_pix = blob->cblob()->render_outline();
+        pixRasterop(nontext_mask, box.left(), imageheight - box.top(), box.width(), box.height(),
+                    PIX_SRC | PIX_DST, blob_pix, 0, 0);
+        blob_pix.destroy();
+      } else {
+        if (box.area() < gridsize() * gridsize()) {
+          // It is a really bad idea to make lots of small components in the
+          // photo mask, so try to join it to a bigger area by expanding the
+          // box in a way that does not touch any zero noise density cell.
+          box = AttemptBoxExpansion(box, *noise_density_, gridsize());
+        }
+        // All overlapped cells are non-zero, so just mark the rectangle.
+        pixRasterop(nontext_mask, box.left(), imageheight - box.top(), box.width(), box.height(),
+                    PIX_SET, nullptr, 0, 0);
+      }
+#ifndef GRAPHICS_DISABLED
+      if (win != nullptr) {
+        blob->plot(win, ScrollView::RED, ScrollView::RED);
+      }
+#endif // !GRAPHICS_DISABLED
+      // It is safe to delete the cblob now, as it isn't used by the grid
+      // or BlobOverlapsTooMuch, and the BLOBNBOXes will go away with the
+      // dead_blobs list.
+      // TODO: remove next line, currently still needed for resultiterator_test.
+      delete blob->remove_cblob();
+      dead_it.add_to_end(blob_it.extract());
+    }
+  }
+}
+
+// Reports whether the given blob has a major overlap with more than
+// max_overlaps blobs in the current grid. Stops searching as soon as the
+// limit is exceeded. A negative max_overlaps always yields false.
+bool CCNonTextDetect::BlobOverlapsTooMuch(BLOBNBOX *blob, int max_overlaps) {
+  // Rectangle search over the grid cells touched by the blob's bounding box.
+  BlobGridSearch rsearch(this);
+  const TBOX &blob_box = blob->bounding_box();
+  rsearch.StartRectSearch(blob_box);
+  rsearch.SetUniqueMode(true);
+  if (max_overlaps < 0) {
+    // Nothing is ever examined for a negative limit.
+    return false;
+  }
+  int hits = 0;
+  for (BLOBNBOX *candidate = rsearch.NextRectSearch(); candidate != nullptr;
+       candidate = rsearch.NextRectSearch()) {
+    if (!blob_box.major_overlap(candidate->bounding_box())) {
+      continue;
+    }
+    if (++hits > max_overlaps) {
+      return true;
+    }
+  }
+  return false;
+}
+
+} // namespace tesseract.