Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/imagefind.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: imagefind.cpp | |
| 3 // Description: Function to find image and drawing regions in an image | |
| 4 // and create a corresponding list of empty blobs. | |
| 5 // Author: Ray Smith | |
| 6 // | |
| 7 // (C) Copyright 2008, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 // | |
| 18 /////////////////////////////////////////////////////////////////////// | |
| 19 | |
| 20 #ifdef HAVE_CONFIG_H | |
| 21 # include "config_auto.h" | |
| 22 #endif | |
| 23 | |
| 24 #include "imagefind.h" | |
| 25 | |
| 26 #include "colpartitiongrid.h" | |
| 27 #include "linlsq.h" | |
| 28 #include "params.h" | |
| 29 #include "statistc.h" | |
| 30 | |
| 31 #include <allheaders.h> | |
| 32 | |
| 33 #include <algorithm> | |
| 34 | |
| 35 namespace tesseract { | |
| 36 | |
| 37 static INT_VAR(textord_tabfind_show_images, false, "Show image blobs"); | |
| 38 | |
| 39 // Fraction of width or height of on pixels that can be discarded from a | |
| 40 // roughly rectangular image. | |
| 41 const double kMinRectangularFraction = 0.125; | |
| 42 // Fraction of width or height to consider image completely used. | |
| 43 const double kMaxRectangularFraction = 0.75; | |
| 44 // Fraction of width or height to allow transition from kMinRectangularFraction | |
| 45 // to kMaxRectangularFraction, equivalent to a dy/dx skew. | |
| 46 const double kMaxRectangularGradient = 0.1; // About 6 degrees. | |
| 47 // Minimum image size to be worth looking for images on. | |
| 48 const int kMinImageFindSize = 100; | |
| 49 // Pixel padding for noise blobs and partitions when rendering on the image | |
| 50 // mask to encourage them to join together. Make it too big and images | |
| 51 // will fatten out too much and have to be clipped to text. | |
| 52 const int kNoisePadding = 4; | |
| 53 | |
| 54 // Scans horizontally on x=[x_start,x_end), starting with y=*y_start, | |
| 55 // stepping y+=y_step, until y=y_end. *ystart is input/output. | |
| 56 // If the number of black pixels in a row, pix_count fits this pattern: | |
| 57 // 0 or more rows with pix_count < min_count then | |
| 58 // <= mid_width rows with min_count <= pix_count <= max_count then | |
| 59 // a row with pix_count > max_count then | |
| 60 // true is returned, and *y_start = the first y with pix_count >= min_count. | |
| 61 static bool HScanForEdge(uint32_t *data, int wpl, int x_start, int x_end, int min_count, | |
| 62 int mid_width, int max_count, int y_end, int y_step, int *y_start) { | |
| 63 int mid_rows = 0; | |
| 64 for (int y = *y_start; y != y_end; y += y_step) { | |
| 65 // Need pixCountPixelsInRow(pix, y, &pix_count, nullptr) to count in a | |
| 66 // subset. | |
| 67 int pix_count = 0; | |
| 68 uint32_t *line = data + wpl * y; | |
| 69 for (int x = x_start; x < x_end; ++x) { | |
| 70 if (GET_DATA_BIT(line, x)) { | |
| 71 ++pix_count; | |
| 72 } | |
| 73 } | |
| 74 if (mid_rows == 0 && pix_count < min_count) { | |
| 75 continue; // In the min phase. | |
| 76 } | |
| 77 if (mid_rows == 0) { | |
| 78 *y_start = y; // Save the y_start where we came out of the min phase. | |
| 79 } | |
| 80 if (pix_count > max_count) { | |
| 81 return true; // Found the pattern. | |
| 82 } | |
| 83 ++mid_rows; | |
| 84 if (mid_rows > mid_width) { | |
| 85 break; // Middle too big. | |
| 86 } | |
| 87 } | |
| 88 return false; // Never found max_count. | |
| 89 } | |
| 90 | |
| 91 // Scans vertically on y=[y_start,y_end), starting with x=*x_start, | |
| 92 // stepping x+=x_step, until x=x_end. *x_start is input/output. | |
| 93 // If the number of black pixels in a column, pix_count fits this pattern: | |
| 94 // 0 or more cols with pix_count < min_count then | |
| 95 // <= mid_width cols with min_count <= pix_count <= max_count then | |
| 96 // a column with pix_count > max_count then | |
| 97 // true is returned, and *x_start = the first x with pix_count >= min_count. | |
| 98 static bool VScanForEdge(uint32_t *data, int wpl, int y_start, int y_end, int min_count, | |
| 99 int mid_width, int max_count, int x_end, int x_step, int *x_start) { | |
| 100 int mid_cols = 0; | |
| 101 for (int x = *x_start; x != x_end; x += x_step) { | |
| 102 int pix_count = 0; | |
| 103 uint32_t *line = data + y_start * wpl; | |
| 104 for (int y = y_start; y < y_end; ++y, line += wpl) { | |
| 105 if (GET_DATA_BIT(line, x)) { | |
| 106 ++pix_count; | |
| 107 } | |
| 108 } | |
| 109 if (mid_cols == 0 && pix_count < min_count) { | |
| 110 continue; // In the min phase. | |
| 111 } | |
| 112 if (mid_cols == 0) { | |
| 113 *x_start = x; // Save the place where we came out of the min phase. | |
| 114 } | |
| 115 if (pix_count > max_count) { | |
| 116 return true; // found the pattern. | |
| 117 } | |
| 118 ++mid_cols; | |
| 119 if (mid_cols > mid_width) { | |
| 120 break; // Middle too big. | |
| 121 } | |
| 122 } | |
| 123 return false; // Never found max_count. | |
| 124 } | |
| 125 | |
| 126 // Returns true if there is a rectangle in the source pix, such that all | |
| 127 // pixel rows and column slices outside of it have less than | |
| 128 // min_fraction of the pixels black, and within max_skew_gradient fraction | |
| 129 // of the pixels on the inside, there are at least max_fraction of the | |
| 130 // pixels black. In other words, the inside of the rectangle looks roughly | |
| 131 // rectangular, and the outside of it looks like extra bits. | |
| 132 // On return, the rectangle is defined by x_start, y_start, x_end and y_end. | |
| 133 // Note: the algorithm is iterative, allowing it to slice off pixels from | |
| 134 // one edge, allowing it to then slice off more pixels from another edge. | |
| 135 static bool pixNearlyRectangular(Image pix, double min_fraction, double max_fraction, | |
| 136 double max_skew_gradient, int *x_start, int *y_start, | |
| 137 int *x_end, int *y_end) { | |
| 138 ASSERT_HOST(pix != nullptr); | |
| 139 *x_start = 0; | |
| 140 *x_end = pixGetWidth(pix); | |
| 141 *y_start = 0; | |
| 142 *y_end = pixGetHeight(pix); | |
| 143 | |
| 144 uint32_t *data = pixGetData(pix); | |
| 145 int wpl = pixGetWpl(pix); | |
| 146 bool any_cut = false; | |
| 147 bool left_done = false; | |
| 148 bool right_done = false; | |
| 149 bool top_done = false; | |
| 150 bool bottom_done = false; | |
| 151 do { | |
| 152 any_cut = false; | |
| 153 // Find the top/bottom edges. | |
| 154 int width = *x_end - *x_start; | |
| 155 int min_count = static_cast<int>(width * min_fraction); | |
| 156 int max_count = static_cast<int>(width * max_fraction); | |
| 157 int edge_width = static_cast<int>(width * max_skew_gradient); | |
| 158 if (HScanForEdge(data, wpl, *x_start, *x_end, min_count, edge_width, max_count, *y_end, 1, | |
| 159 y_start) && | |
| 160 !top_done) { | |
| 161 top_done = true; | |
| 162 any_cut = true; | |
| 163 } | |
| 164 --(*y_end); | |
| 165 if (HScanForEdge(data, wpl, *x_start, *x_end, min_count, edge_width, max_count, *y_start, -1, | |
| 166 y_end) && | |
| 167 !bottom_done) { | |
| 168 bottom_done = true; | |
| 169 any_cut = true; | |
| 170 } | |
| 171 ++(*y_end); | |
| 172 | |
| 173 // Find the left/right edges. | |
| 174 int height = *y_end - *y_start; | |
| 175 min_count = static_cast<int>(height * min_fraction); | |
| 176 max_count = static_cast<int>(height * max_fraction); | |
| 177 edge_width = static_cast<int>(height * max_skew_gradient); | |
| 178 if (VScanForEdge(data, wpl, *y_start, *y_end, min_count, edge_width, max_count, *x_end, 1, | |
| 179 x_start) && | |
| 180 !left_done) { | |
| 181 left_done = true; | |
| 182 any_cut = true; | |
| 183 } | |
| 184 --(*x_end); | |
| 185 if (VScanForEdge(data, wpl, *y_start, *y_end, min_count, edge_width, max_count, *x_start, -1, | |
| 186 x_end) && | |
| 187 !right_done) { | |
| 188 right_done = true; | |
| 189 any_cut = true; | |
| 190 } | |
| 191 ++(*x_end); | |
| 192 } while (any_cut); | |
| 193 | |
| 194 // All edges must satisfy the condition of sharp gradient in pixel density | |
| 195 // in order for the full rectangle to be present. | |
| 196 return left_done && right_done && top_done && bottom_done; | |
| 197 } | |
| 198 | |
| 199 // Generates a Boxa, Pixa pair from the input binary (image mask) pix, | |
| 200 // analogous to pixConnComp, except that connected components which are nearly | |
| 201 // rectangular are replaced with solid rectangles. | |
| 202 // The returned boxa, pixa may be nullptr, meaning no images found. | |
| 203 // If not nullptr, they must be destroyed by the caller. | |
| 204 // Resolution of pix should match the source image (Tesseract::pix_binary_) | |
| 205 // so the output coordinate systems match. | |
| 206 static void ConnCompAndRectangularize(Image pix, DebugPixa *pixa_debug, Boxa **boxa, | |
| 207 Pixa **pixa) { | |
| 208 *boxa = nullptr; | |
| 209 *pixa = nullptr; | |
| 210 | |
| 211 if (textord_tabfind_show_images && pixa_debug != nullptr) { | |
| 212 pixa_debug->AddPix(pix, "Conncompimage"); | |
| 213 } | |
| 214 // Find the individual image regions in the mask image. | |
| 215 *boxa = pixConnComp(pix, pixa, 8); | |
| 216 // Rectangularize the individual images. If a sharp edge in vertical and/or | |
| 217 // horizontal occupancy can be found, it indicates a probably rectangular | |
| 218 // image with unwanted bits merged on, so clip to the approximate rectangle. | |
| 219 int npixes = 0; | |
| 220 if (*boxa != nullptr && *pixa != nullptr) { | |
| 221 npixes = pixaGetCount(*pixa); | |
| 222 } | |
| 223 for (int i = 0; i < npixes; ++i) { | |
| 224 int x_start, x_end, y_start, y_end; | |
| 225 Image img_pix = pixaGetPix(*pixa, i, L_CLONE); | |
| 226 if (textord_tabfind_show_images && pixa_debug != nullptr) { | |
| 227 pixa_debug->AddPix(img_pix, "A component"); | |
| 228 } | |
| 229 if (pixNearlyRectangular(img_pix, kMinRectangularFraction, kMaxRectangularFraction, | |
| 230 kMaxRectangularGradient, &x_start, &y_start, &x_end, &y_end)) { | |
| 231 Image simple_pix = pixCreate(x_end - x_start, y_end - y_start, 1); | |
| 232 pixSetAll(simple_pix); | |
| 233 img_pix.destroy(); | |
| 234 // pixaReplacePix takes ownership of the simple_pix. | |
| 235 pixaReplacePix(*pixa, i, simple_pix, nullptr); | |
| 236 img_pix = pixaGetPix(*pixa, i, L_CLONE); | |
| 237 // Fix the box to match the new pix. | |
| 238 l_int32 x, y, width, height; | |
| 239 boxaGetBoxGeometry(*boxa, i, &x, &y, &width, &height); | |
| 240 Box *simple_box = boxCreate(x + x_start, y + y_start, x_end - x_start, y_end - y_start); | |
| 241 boxaReplaceBox(*boxa, i, simple_box); | |
| 242 } | |
| 243 img_pix.destroy(); | |
| 244 } | |
| 245 } | |
| 246 | |
| 247 // Finds image regions within the BINARY source pix (page image) and returns | |
| 248 // the image regions as a mask image. | |
| 249 // The returned pix may be nullptr, meaning no images found. | |
| 250 // If not nullptr, it must be PixDestroyed by the caller. | |
| 251 // If textord_tabfind_show_images, debug images are appended to pixa_debug. | |
| 252 Image ImageFind::FindImages(Image pix, DebugPixa *pixa_debug) { | |
| 253 auto width = pixGetWidth(pix); | |
| 254 auto height = pixGetHeight(pix); | |
| 255 // Not worth looking at small images. | |
| 256 // Leptonica will print an error message and return nullptr if we call | |
| 257 // pixGenHalftoneMask(pixr, nullptr, ...) with width or height < 100 | |
| 258 // for the reduced image, so we want to bypass that, too. | |
| 259 if (width / 2 < kMinImageFindSize || height / 2 < kMinImageFindSize) { | |
| 260 return pixCreate(width, height, 1); | |
| 261 } | |
| 262 | |
| 263 // Reduce by factor 2. | |
| 264 Image pixr = pixReduceRankBinaryCascade(pix, 1, 0, 0, 0); | |
| 265 if (textord_tabfind_show_images && pixa_debug != nullptr) { | |
| 266 pixa_debug->AddPix(pixr, "CascadeReduced"); | |
| 267 } | |
| 268 | |
| 269 // Get the halftone mask directly from Leptonica. | |
| 270 l_int32 ht_found = 0; | |
| 271 Pixa *pixadb = (textord_tabfind_show_images && pixa_debug != nullptr) ? pixaCreate(0) : nullptr; | |
| 272 Image pixht2 = pixGenerateHalftoneMask(pixr, nullptr, &ht_found, pixadb); | |
| 273 if (pixadb) { | |
| 274 Image pixdb = pixaDisplayTiledInColumns(pixadb, 3, 1.0, 20, 2); | |
| 275 if (textord_tabfind_show_images && pixa_debug != nullptr) { | |
| 276 pixa_debug->AddPix(pixdb, "HalftoneMask"); | |
| 277 } | |
| 278 pixdb.destroy(); | |
| 279 pixaDestroy(&pixadb); | |
| 280 } | |
| 281 pixr.destroy(); | |
| 282 if (!ht_found && pixht2 != nullptr) { | |
| 283 pixht2.destroy(); | |
| 284 } | |
| 285 if (pixht2 == nullptr) { | |
| 286 return pixCreate(width, height, 1); | |
| 287 } | |
| 288 | |
| 289 // Expand back up again. | |
| 290 Image pixht = pixExpandReplicate(pixht2, 2); | |
| 291 if (textord_tabfind_show_images && pixa_debug != nullptr) { | |
| 292 pixa_debug->AddPix(pixht, "HalftoneReplicated"); | |
| 293 } | |
| 294 pixht2.destroy(); | |
| 295 | |
| 296 // Fill to capture pixels near the mask edges that were missed | |
| 297 Image pixt = pixSeedfillBinary(nullptr, pixht, pix, 8); | |
| 298 pixht |= pixt; | |
| 299 pixt.destroy(); | |
| 300 | |
| 301 // Eliminate lines and bars that may be joined to images. | |
| 302 Image pixfinemask = pixReduceRankBinaryCascade(pixht, 1, 1, 3, 3); | |
| 303 pixDilateBrick(pixfinemask, pixfinemask, 5, 5); | |
| 304 if (textord_tabfind_show_images && pixa_debug != nullptr) { | |
| 305 pixa_debug->AddPix(pixfinemask, "FineMask"); | |
| 306 } | |
| 307 Image pixreduced = pixReduceRankBinaryCascade(pixht, 1, 1, 1, 1); | |
| 308 Image pixreduced2 = pixReduceRankBinaryCascade(pixreduced, 3, 3, 3, 0); | |
| 309 pixreduced.destroy(); | |
| 310 pixDilateBrick(pixreduced2, pixreduced2, 5, 5); | |
| 311 Image pixcoarsemask = pixExpandReplicate(pixreduced2, 8); | |
| 312 pixreduced2.destroy(); | |
| 313 if (textord_tabfind_show_images && pixa_debug != nullptr) { | |
| 314 pixa_debug->AddPix(pixcoarsemask, "CoarseMask"); | |
| 315 } | |
| 316 // Combine the coarse and fine image masks. | |
| 317 pixcoarsemask &= pixfinemask; | |
| 318 pixfinemask.destroy(); | |
| 319 // Dilate a bit to make sure we get everything. | |
| 320 pixDilateBrick(pixcoarsemask, pixcoarsemask, 3, 3); | |
| 321 Image pixmask = pixExpandReplicate(pixcoarsemask, 16); | |
| 322 pixcoarsemask.destroy(); | |
| 323 if (textord_tabfind_show_images && pixa_debug != nullptr) { | |
| 324 pixa_debug->AddPix(pixmask, "MaskDilated"); | |
| 325 } | |
| 326 // And the image mask with the line and bar remover. | |
| 327 pixht &= pixmask; | |
| 328 pixmask.destroy(); | |
| 329 if (textord_tabfind_show_images && pixa_debug != nullptr) { | |
| 330 pixa_debug->AddPix(pixht, "FinalMask"); | |
| 331 } | |
| 332 // Make the result image the same size as the input. | |
| 333 Image result = pixCreate(width, height, 1); | |
| 334 result |= pixht; | |
| 335 pixht.destroy(); | |
| 336 return result; | |
| 337 } | |
| 338 | |
| 339 // Given an input pix, and a bounding rectangle, the sides of the rectangle | |
| 340 // are shrunk inwards until they bound any black pixels found within the | |
| 341 // original rectangle. Returns false if the rectangle contains no black | |
| 342 // pixels at all. | |
| 343 bool ImageFind::BoundsWithinRect(Image pix, int *x_start, int *y_start, int *x_end, int *y_end) { | |
| 344 Box *input_box = boxCreate(*x_start, *y_start, *x_end - *x_start, *y_end - *y_start); | |
| 345 Box *output_box = nullptr; | |
| 346 pixClipBoxToForeground(pix, input_box, nullptr, &output_box); | |
| 347 bool result = output_box != nullptr; | |
| 348 if (result) { | |
| 349 l_int32 x, y, width, height; | |
| 350 boxGetGeometry(output_box, &x, &y, &width, &height); | |
| 351 *x_start = x; | |
| 352 *y_start = y; | |
| 353 *x_end = x + width; | |
| 354 *y_end = y + height; | |
| 355 boxDestroy(&output_box); | |
| 356 } | |
| 357 boxDestroy(&input_box); | |
| 358 return result; | |
| 359 } | |
| 360 | |
| 361 // Given a point in 3-D (RGB) space, returns the squared Euclidean distance | |
| 362 // of the point from the given line, defined by a pair of points in the 3-D | |
| 363 // (RGB) space, line1 and line2. | |
| 364 double ImageFind::ColorDistanceFromLine(const uint8_t *line1, const uint8_t *line2, | |
| 365 const uint8_t *point) { | |
| 366 int line_vector[kRGBRMSColors]; | |
| 367 int point_vector[kRGBRMSColors]; | |
| 368 for (int i = 0; i < kRGBRMSColors; ++i) { | |
| 369 line_vector[i] = static_cast<int>(line2[i]) - static_cast<int>(line1[i]); | |
| 370 point_vector[i] = static_cast<int>(point[i]) - static_cast<int>(line1[i]); | |
| 371 } | |
| 372 line_vector[L_ALPHA_CHANNEL] = 0; | |
| 373 // Now the cross product in 3d. | |
| 374 int cross[kRGBRMSColors]; | |
| 375 cross[COLOR_RED] = line_vector[COLOR_GREEN] * point_vector[COLOR_BLUE] - | |
| 376 line_vector[COLOR_BLUE] * point_vector[COLOR_GREEN]; | |
| 377 cross[COLOR_GREEN] = line_vector[COLOR_BLUE] * point_vector[COLOR_RED] - | |
| 378 line_vector[COLOR_RED] * point_vector[COLOR_BLUE]; | |
| 379 cross[COLOR_BLUE] = line_vector[COLOR_RED] * point_vector[COLOR_GREEN] - | |
| 380 line_vector[COLOR_GREEN] * point_vector[COLOR_RED]; | |
| 381 cross[L_ALPHA_CHANNEL] = 0; | |
| 382 // Now the sums of the squares. | |
| 383 double cross_sq = 0.0; | |
| 384 double line_sq = 0.0; | |
| 385 for (int j = 0; j < kRGBRMSColors; ++j) { | |
| 386 cross_sq += static_cast<double>(cross[j]) * cross[j]; | |
| 387 line_sq += static_cast<double>(line_vector[j]) * line_vector[j]; | |
| 388 } | |
| 389 if (line_sq == 0.0) { | |
| 390 return 0.0; | |
| 391 } | |
| 392 return cross_sq / line_sq; // This is the squared distance. | |
| 393 } | |
| 394 | |
| 395 // ================ CUTTING POLYGONAL IMAGES FROM A RECTANGLE ================ | |
| 396 // The following functions are responsible for cutting a polygonal image from | |
| 397 // a rectangle: CountPixelsInRotatedBox, AttemptToShrinkBox, CutChunkFromParts | |
| 398 // with DivideImageIntoParts as the master. | |
| 399 // Problem statement: | |
| 400 // We start with a single connected component from the image mask: we get | |
| 401 // a Pix of the component, and its location on the page (im_box). | |
| 402 // The objective of cutting a polygonal image from its rectangle is to avoid | |
| 403 // interfering text, but not text that completely overlaps the image. | |
| 404 // ------------------------------ ------------------------------ | |
| 405 // | Single input partition | | 1 Cut up output partitions | | |
| 406 // | | ------------------------------ | |
| 407 // Av|oid | Avoid | | | |
| 408 // | | |________________________| | |
| 409 // Int|erfering | Interfering | | | |
| 410 // | | _____|__________________| | |
| 411 // T|ext | Text | | | |
| 412 // | Text-on-image | | Text-on-image | | |
| 413 // ------------------------------ -------------------------- | |
| 414 // DivideImageIntoParts does this by building a ColPartition_LIST (not in the | |
| 415 // grid) with each ColPartition representing one of the rectangles needed, | |
| 416 // starting with a single rectangle for the whole image component, and cutting | |
| 417 // bits out of it with CutChunkFromParts as needed to avoid text. The output | |
| 418 // ColPartitions are supposed to be ordered from top to bottom. | |
| 419 | |
| 420 // The problem is complicated by the fact that we have rotated the coordinate | |
| 421 // system to make text lines horizontal, so if we need to look at the component | |
| 422 // image, we have to rotate the coordinates. Throughout the functions in this | |
| 423 // section im_box is the rectangle representing the image component in the | |
| 424 // rotated page coordinates (where we are building our output ColPartitions), | |
| 425 // rotation is the rotation that we used to get there, and rerotation is the | |
| 426 // rotation required to get back to original page image coordinates. | |
| 427 // To get to coordinates in the component image, pix, we rotate the im_box, | |
| 428 // the point we want to locate, and subtract the rotated point from the top-left | |
| 429 // of the rotated im_box. | |
| 430 // im_box is therefore essential to calculating coordinates within the pix. | |
| 431 | |
| 432 // Returns true if there are no black pixels in between the boxes. | |
| 433 // The im_box must represent the bounding box of the pix in tesseract | |
| 434 // coordinates, which may be negative, due to rotations to make the textlines | |
| 435 // horizontal. The boxes are rotated by rotation, which should undo such | |
| 436 // rotations, before mapping them onto the pix. | |
| 437 bool ImageFind::BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box, | |
| 438 const FCOORD &rotation, Image pix) { | |
| 439 TBOX search_box(box1); | |
| 440 search_box += box2; | |
| 441 if (box1.x_gap(box2) >= box1.y_gap(box2)) { | |
| 442 if (box1.x_gap(box2) <= 0) { | |
| 443 return true; | |
| 444 } | |
| 445 search_box.set_left(std::min(box1.right(), box2.right())); | |
| 446 search_box.set_right(std::max(box1.left(), box2.left())); | |
| 447 } else { | |
| 448 if (box1.y_gap(box2) <= 0) { | |
| 449 return true; | |
| 450 } | |
| 451 search_box.set_top(std::max(box1.bottom(), box2.bottom())); | |
| 452 search_box.set_bottom(std::min(box1.top(), box2.top())); | |
| 453 } | |
| 454 return CountPixelsInRotatedBox(search_box, im_box, rotation, pix) == 0; | |
| 455 } | |
| 456 | |
| 457 // Returns the number of pixels in box in the pix. | |
| 458 // rotation, pix and im_box are defined in the large comment above. | |
| 459 int ImageFind::CountPixelsInRotatedBox(TBOX box, const TBOX &im_box, const FCOORD &rotation, | |
| 460 Image pix) { | |
| 461 // Intersect it with the image box. | |
| 462 box &= im_box; // This is in-place box intersection. | |
| 463 if (box.null_box()) { | |
| 464 return 0; | |
| 465 } | |
| 466 box.rotate(rotation); | |
| 467 TBOX rotated_im_box(im_box); | |
| 468 rotated_im_box.rotate(rotation); | |
| 469 Image rect_pix = pixCreate(box.width(), box.height(), 1); | |
| 470 pixRasterop(rect_pix, 0, 0, box.width(), box.height(), PIX_SRC, pix, | |
| 471 box.left() - rotated_im_box.left(), rotated_im_box.top() - box.top()); | |
| 472 l_int32 result; | |
| 473 pixCountPixels(rect_pix, &result, nullptr); | |
| 474 rect_pix.destroy(); | |
| 475 return result; | |
| 476 } | |
| 477 | |
| 478 // The box given by slice contains some black pixels, but not necessarily | |
| 479 // over the whole box. Shrink the x bounds of slice, but not the y bounds | |
| 480 // until there is at least one black pixel in the outermost columns. | |
| 481 // rotation, rerotation, pix and im_box are defined in the large comment above. | |
| 482 static void AttemptToShrinkBox(const FCOORD &rotation, const FCOORD &rerotation, const TBOX &im_box, | |
| 483 Image pix, TBOX *slice) { | |
| 484 TBOX rotated_box(*slice); | |
| 485 rotated_box.rotate(rerotation); | |
| 486 TBOX rotated_im_box(im_box); | |
| 487 rotated_im_box.rotate(rerotation); | |
| 488 int left = rotated_box.left() - rotated_im_box.left(); | |
| 489 int right = rotated_box.right() - rotated_im_box.left(); | |
| 490 int top = rotated_im_box.top() - rotated_box.top(); | |
| 491 int bottom = rotated_im_box.top() - rotated_box.bottom(); | |
| 492 ImageFind::BoundsWithinRect(pix, &left, &top, &right, &bottom); | |
| 493 top = rotated_im_box.top() - top; | |
| 494 bottom = rotated_im_box.top() - bottom; | |
| 495 left += rotated_im_box.left(); | |
| 496 right += rotated_im_box.left(); | |
| 497 rotated_box.set_to_given_coords(left, bottom, right, top); | |
| 498 rotated_box.rotate(rotation); | |
| 499 slice->set_left(rotated_box.left()); | |
| 500 slice->set_right(rotated_box.right()); | |
| 501 } | |
| 502 | |
| 503 // The meat of cutting a polygonal image around text. | |
| 504 // This function covers the general case of cutting a box out of a box | |
| 505 // as shown: | |
| 506 // Input Output | |
| 507 // ------------------------------ ------------------------------ | |
| 508 // | Single input partition | | 1 Cut up output partitions | | |
| 509 // | | ------------------------------ | |
| 510 // | ---------- | --------- ---------- | |
| 511 // | | box | | | 2 | box | 3 | | |
| 512 // | | | | | | is cut | | | |
| 513 // | ---------- | --------- out ---------- | |
| 514 // | | ------------------------------ | |
| 515 // | | | 4 | | |
| 516 // ------------------------------ ------------------------------ | |
| 517 // In the context that this function is used, at most 3 of the above output | |
| 518 // boxes will be created, as the overlapping box is never contained by the | |
| 519 // input. | |
| 520 // The above cutting operation is executed for each element of part_list that | |
| 521 // is overlapped by the input box. Each modified ColPartition is replaced | |
| 522 // in place in the list by the output of the cutting operation in the order | |
| 523 // shown above, so iff no holes are ever created, the output will be in | |
| 524 // top-to-bottom order, but in extreme cases, hole creation is possible. | |
| 525 // In such cases, the output order may cause strange block polygons. | |
| 526 // rotation, rerotation, pix and im_box are defined in the large comment above. | |
| 527 static void CutChunkFromParts(const TBOX &box, const TBOX &im_box, const FCOORD &rotation, | |
| 528 const FCOORD &rerotation, Image pix, ColPartition_LIST *part_list) { | |
| 529 ASSERT_HOST(!part_list->empty()); | |
| 530 ColPartition_IT part_it(part_list); | |
| 531 do { | |
| 532 ColPartition *part = part_it.data(); | |
| 533 TBOX part_box = part->bounding_box(); | |
| 534 if (part_box.overlap(box)) { | |
| 535 // This part must be cut and replaced with the remains. There are | |
| 536 // up to 4 pieces to be made. Start with the first one and use | |
| 537 // add_before_stay_put. For each piece if it has no black pixels | |
| 538 // left, just don't make the box. | |
| 539 // Above box. | |
| 540 if (box.top() < part_box.top()) { | |
| 541 TBOX slice(part_box); | |
| 542 slice.set_bottom(box.top()); | |
| 543 if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) { | |
| 544 AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice); | |
| 545 part_it.add_before_stay_put( | |
| 546 ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, BTFT_NONTEXT)); | |
| 547 } | |
| 548 } | |
| 549 // Left of box. | |
| 550 if (box.left() > part_box.left()) { | |
| 551 TBOX slice(part_box); | |
| 552 slice.set_right(box.left()); | |
| 553 if (box.top() < part_box.top()) { | |
| 554 slice.set_top(box.top()); | |
| 555 } | |
| 556 if (box.bottom() > part_box.bottom()) { | |
| 557 slice.set_bottom(box.bottom()); | |
| 558 } | |
| 559 if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) { | |
| 560 AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice); | |
| 561 part_it.add_before_stay_put( | |
| 562 ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, BTFT_NONTEXT)); | |
| 563 } | |
| 564 } | |
| 565 // Right of box. | |
| 566 if (box.right() < part_box.right()) { | |
| 567 TBOX slice(part_box); | |
| 568 slice.set_left(box.right()); | |
| 569 if (box.top() < part_box.top()) { | |
| 570 slice.set_top(box.top()); | |
| 571 } | |
| 572 if (box.bottom() > part_box.bottom()) { | |
| 573 slice.set_bottom(box.bottom()); | |
| 574 } | |
| 575 if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) { | |
| 576 AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice); | |
| 577 part_it.add_before_stay_put( | |
| 578 ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, BTFT_NONTEXT)); | |
| 579 } | |
| 580 } | |
| 581 // Below box. | |
| 582 if (box.bottom() > part_box.bottom()) { | |
| 583 TBOX slice(part_box); | |
| 584 slice.set_top(box.bottom()); | |
| 585 if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) { | |
| 586 AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice); | |
| 587 part_it.add_before_stay_put( | |
| 588 ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, BTFT_NONTEXT)); | |
| 589 } | |
| 590 } | |
| 591 part->DeleteBoxes(); | |
| 592 delete part_it.extract(); | |
| 593 } | |
| 594 part_it.forward(); | |
| 595 } while (!part_it.at_first()); | |
| 596 } | |
| 597 | |
| 598 // Starts with the bounding box of the image component and cuts it up | |
| 599 // so that it doesn't intersect text where possible. | |
| 600 // Strong fully contained horizontal text is marked as text on image, | |
| 601 // and does not cause a division of the image. | |
| 602 // For more detail see the large comment above on cutting polygonal images | |
| 603 // from a rectangle. | |
| 604 // rotation, rerotation, pix and im_box are defined in the large comment above. | |
| 605 static void DivideImageIntoParts(const TBOX &im_box, const FCOORD &rotation, | |
| 606 const FCOORD &rerotation, Image pix, | |
| 607 ColPartitionGridSearch *rectsearch, ColPartition_LIST *part_list) { | |
| 608 // Add the full im_box partition to the list to begin with. | |
| 609 ColPartition *pix_part = | |
| 610 ColPartition::FakePartition(im_box, PT_UNKNOWN, BRT_RECTIMAGE, BTFT_NONTEXT); | |
| 611 ColPartition_IT part_it(part_list); | |
| 612 part_it.add_after_then_move(pix_part); | |
| 613 | |
| 614 rectsearch->StartRectSearch(im_box); | |
| 615 ColPartition *part; | |
| 616 while ((part = rectsearch->NextRectSearch()) != nullptr) { | |
| 617 TBOX part_box = part->bounding_box(); | |
| 618 if (part_box.contains(im_box) && part->flow() >= BTFT_CHAIN) { | |
| 619 // This image is completely covered by an existing text partition. | |
| 620 for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) { | |
| 621 ColPartition *pix_part = part_it.extract(); | |
| 622 pix_part->DeleteBoxes(); | |
| 623 delete pix_part; | |
| 624 } | |
| 625 } else if (part->flow() == BTFT_STRONG_CHAIN) { | |
| 626 // Text intersects the box. | |
| 627 TBOX overlap_box = part_box.intersection(im_box); | |
| 628 // Intersect it with the image box. | |
| 629 int black_area = ImageFind::CountPixelsInRotatedBox(overlap_box, im_box, rerotation, pix); | |
| 630 if (black_area * 2 < part_box.area() || !im_box.contains(part_box)) { | |
| 631 // Eat a piece out of the image. | |
| 632 // Pad it so that pieces eaten out look decent. | |
| 633 int padding = part->blob_type() == BRT_VERT_TEXT ? part_box.width() : part_box.height(); | |
| 634 part_box.set_top(part_box.top() + padding / 2); | |
| 635 part_box.set_bottom(part_box.bottom() - padding / 2); | |
| 636 CutChunkFromParts(part_box, im_box, rotation, rerotation, pix, part_list); | |
| 637 } else { | |
| 638 // Strong overlap with the black area, so call it text on image. | |
| 639 part->set_flow(BTFT_TEXT_ON_IMAGE); | |
| 640 } | |
| 641 } | |
| 642 if (part_list->empty()) { | |
| 643 break; | |
| 644 } | |
| 645 } | |
| 646 } | |
| 647 | |
| 648 // Search for the rightmost text that overlaps vertically and is to the left | |
| 649 // of the given box, but within the given left limit. | |
| 650 static int ExpandImageLeft(const TBOX &box, int left_limit, ColPartitionGrid *part_grid) { | |
| 651 ColPartitionGridSearch search(part_grid); | |
| 652 ColPartition *part; | |
| 653 // Search right to left for any text that overlaps. | |
| 654 search.StartSideSearch(box.left(), box.bottom(), box.top()); | |
| 655 while ((part = search.NextSideSearch(true)) != nullptr) { | |
| 656 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { | |
| 657 const TBOX &part_box(part->bounding_box()); | |
| 658 if (part_box.y_gap(box) < 0) { | |
| 659 if (part_box.right() > left_limit && part_box.right() < box.left()) { | |
| 660 left_limit = part_box.right(); | |
| 661 } | |
| 662 break; | |
| 663 } | |
| 664 } | |
| 665 } | |
| 666 if (part != nullptr) { | |
| 667 // Search for the nearest text up to the one we already found. | |
| 668 TBOX search_box(left_limit, box.bottom(), box.left(), box.top()); | |
| 669 search.StartRectSearch(search_box); | |
| 670 while ((part = search.NextRectSearch()) != nullptr) { | |
| 671 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { | |
| 672 const TBOX &part_box(part->bounding_box()); | |
| 673 if (part_box.y_gap(box) < 0) { | |
| 674 if (part_box.right() > left_limit && part_box.right() < box.left()) { | |
| 675 left_limit = part_box.right(); | |
| 676 } | |
| 677 } | |
| 678 } | |
| 679 } | |
| 680 } | |
| 681 return left_limit; | |
| 682 } | |
| 683 | |
| 684 // Search for the leftmost text that overlaps vertically and is to the right | |
| 685 // of the given box, but within the given right limit. | |
| 686 static int ExpandImageRight(const TBOX &box, int right_limit, ColPartitionGrid *part_grid) { | |
| 687 ColPartitionGridSearch search(part_grid); | |
| 688 ColPartition *part; | |
| 689 // Search left to right for any text that overlaps. | |
| 690 search.StartSideSearch(box.right(), box.bottom(), box.top()); | |
| 691 while ((part = search.NextSideSearch(false)) != nullptr) { | |
| 692 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { | |
| 693 const TBOX &part_box(part->bounding_box()); | |
| 694 if (part_box.y_gap(box) < 0) { | |
| 695 if (part_box.left() < right_limit && part_box.left() > box.right()) { | |
| 696 right_limit = part_box.left(); | |
| 697 } | |
| 698 break; | |
| 699 } | |
| 700 } | |
| 701 } | |
| 702 if (part != nullptr) { | |
| 703 // Search for the nearest text up to the one we already found. | |
| 704 TBOX search_box(box.left(), box.bottom(), right_limit, box.top()); | |
| 705 search.StartRectSearch(search_box); | |
| 706 while ((part = search.NextRectSearch()) != nullptr) { | |
| 707 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { | |
| 708 const TBOX &part_box(part->bounding_box()); | |
| 709 if (part_box.y_gap(box) < 0) { | |
| 710 if (part_box.left() < right_limit && part_box.left() > box.right()) { | |
| 711 right_limit = part_box.left(); | |
| 712 } | |
| 713 } | |
| 714 } | |
| 715 } | |
| 716 } | |
| 717 return right_limit; | |
| 718 } | |
| 719 | |
| 720 // Search for the topmost text that overlaps horizontally and is below | |
| 721 // the given box, but within the given bottom limit. | |
| 722 static int ExpandImageBottom(const TBOX &box, int bottom_limit, ColPartitionGrid *part_grid) { | |
| 723 ColPartitionGridSearch search(part_grid); | |
| 724 ColPartition *part; | |
| 725 // Search right to left for any text that overlaps. | |
| 726 search.StartVerticalSearch(box.left(), box.right(), box.bottom()); | |
| 727 while ((part = search.NextVerticalSearch(true)) != nullptr) { | |
| 728 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { | |
| 729 const TBOX &part_box(part->bounding_box()); | |
| 730 if (part_box.x_gap(box) < 0) { | |
| 731 if (part_box.top() > bottom_limit && part_box.top() < box.bottom()) { | |
| 732 bottom_limit = part_box.top(); | |
| 733 } | |
| 734 break; | |
| 735 } | |
| 736 } | |
| 737 } | |
| 738 if (part != nullptr) { | |
| 739 // Search for the nearest text up to the one we already found. | |
| 740 TBOX search_box(box.left(), bottom_limit, box.right(), box.bottom()); | |
| 741 search.StartRectSearch(search_box); | |
| 742 while ((part = search.NextRectSearch()) != nullptr) { | |
| 743 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { | |
| 744 const TBOX &part_box(part->bounding_box()); | |
| 745 if (part_box.x_gap(box) < 0) { | |
| 746 if (part_box.top() > bottom_limit && part_box.top() < box.bottom()) { | |
| 747 bottom_limit = part_box.top(); | |
| 748 } | |
| 749 } | |
| 750 } | |
| 751 } | |
| 752 } | |
| 753 return bottom_limit; | |
| 754 } | |
| 755 | |
| 756 // Search for the bottommost text that overlaps horizontally and is above | |
| 757 // the given box, but within the given top limit. | |
| 758 static int ExpandImageTop(const TBOX &box, int top_limit, ColPartitionGrid *part_grid) { | |
| 759 ColPartitionGridSearch search(part_grid); | |
| 760 ColPartition *part; | |
| 761 // Search right to left for any text that overlaps. | |
| 762 search.StartVerticalSearch(box.left(), box.right(), box.top()); | |
| 763 while ((part = search.NextVerticalSearch(false)) != nullptr) { | |
| 764 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { | |
| 765 const TBOX &part_box(part->bounding_box()); | |
| 766 if (part_box.x_gap(box) < 0) { | |
| 767 if (part_box.bottom() < top_limit && part_box.bottom() > box.top()) { | |
| 768 top_limit = part_box.bottom(); | |
| 769 } | |
| 770 break; | |
| 771 } | |
| 772 } | |
| 773 } | |
| 774 if (part != nullptr) { | |
| 775 // Search for the nearest text up to the one we already found. | |
| 776 TBOX search_box(box.left(), box.top(), box.right(), top_limit); | |
| 777 search.StartRectSearch(search_box); | |
| 778 while ((part = search.NextRectSearch()) != nullptr) { | |
| 779 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { | |
| 780 const TBOX &part_box(part->bounding_box()); | |
| 781 if (part_box.x_gap(box) < 0) { | |
| 782 if (part_box.bottom() < top_limit && part_box.bottom() > box.top()) { | |
| 783 top_limit = part_box.bottom(); | |
| 784 } | |
| 785 } | |
| 786 } | |
| 787 } | |
| 788 } | |
| 789 return top_limit; | |
| 790 } | |
| 791 | |
| 792 // Expands the image box in the given direction until it hits text, | |
| 793 // limiting the expansion to the given limit box, returning the result | |
| 794 // in the expanded box, and | |
| 795 // returning the increase in area resulting from the expansion. | |
| 796 static int ExpandImageDir(BlobNeighbourDir dir, const TBOX &im_box, const TBOX &limit_box, | |
| 797 ColPartitionGrid *part_grid, TBOX *expanded_box) { | |
| 798 *expanded_box = im_box; | |
| 799 switch (dir) { | |
| 800 case BND_LEFT: | |
| 801 expanded_box->set_left(ExpandImageLeft(im_box, limit_box.left(), part_grid)); | |
| 802 break; | |
| 803 case BND_RIGHT: | |
| 804 expanded_box->set_right(ExpandImageRight(im_box, limit_box.right(), part_grid)); | |
| 805 break; | |
| 806 case BND_ABOVE: | |
| 807 expanded_box->set_top(ExpandImageTop(im_box, limit_box.top(), part_grid)); | |
| 808 break; | |
| 809 case BND_BELOW: | |
| 810 expanded_box->set_bottom(ExpandImageBottom(im_box, limit_box.bottom(), part_grid)); | |
| 811 break; | |
| 812 default: | |
| 813 return 0; | |
| 814 } | |
| 815 return expanded_box->area() - im_box.area(); | |
| 816 } | |
| 817 | |
| 818 // Expands the image partition into any non-text until it touches text. | |
| 819 // The expansion proceeds in the order of increasing increase in area | |
| 820 // as a heuristic to find the best rectangle by expanding in the most | |
| 821 // constrained direction first. | |
| 822 static void MaximalImageBoundingBox(ColPartitionGrid *part_grid, TBOX *im_box) { | |
| 823 bool dunnit[BND_COUNT]; | |
| 824 memset(dunnit, 0, sizeof(dunnit)); | |
| 825 TBOX limit_box(part_grid->bleft().x(), part_grid->bleft().y(), part_grid->tright().x(), | |
| 826 part_grid->tright().y()); | |
| 827 TBOX text_box(*im_box); | |
| 828 for (int iteration = 0; iteration < BND_COUNT; ++iteration) { | |
| 829 // Find the direction with least area increase. | |
| 830 int best_delta = -1; | |
| 831 BlobNeighbourDir best_dir = BND_LEFT; | |
| 832 TBOX expanded_boxes[BND_COUNT]; | |
| 833 for (int dir = 0; dir < BND_COUNT; ++dir) { | |
| 834 auto bnd = static_cast<BlobNeighbourDir>(dir); | |
| 835 if (!dunnit[bnd]) { | |
| 836 TBOX expanded_box; | |
| 837 int area_delta = ExpandImageDir(bnd, text_box, limit_box, part_grid, &expanded_boxes[bnd]); | |
| 838 if (best_delta < 0 || area_delta < best_delta) { | |
| 839 best_delta = area_delta; | |
| 840 best_dir = bnd; | |
| 841 } | |
| 842 } | |
| 843 } | |
| 844 // Run the best and remember the direction. | |
| 845 dunnit[best_dir] = true; | |
| 846 text_box = expanded_boxes[best_dir]; | |
| 847 } | |
| 848 *im_box = text_box; | |
| 849 } | |
| 850 | |
| 851 // Helper deletes the given partition but first marks up all the blobs as | |
| 852 // noise, so they get deleted later, and disowns them. | |
| 853 // If the initial type of the partition is image, then it actually deletes | |
| 854 // the blobs, as the partition owns them in that case. | |
| 855 static void DeletePartition(ColPartition *part) { | |
| 856 BlobRegionType type = part->blob_type(); | |
| 857 if (type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) { | |
| 858 // The partition owns the boxes of these types, so just delete them. | |
| 859 part->DeleteBoxes(); // From a previous iteration. | |
| 860 } else { | |
| 861 // Once marked, the blobs will be swept up by TidyBlobs. | |
| 862 part->set_flow(BTFT_NONTEXT); | |
| 863 part->set_blob_type(BRT_NOISE); | |
| 864 part->SetBlobTypes(); | |
| 865 part->DisownBoxes(); // Created before FindImagePartitions. | |
| 866 } | |
| 867 delete part; | |
| 868 } | |
| 869 | |
| 870 // The meat of joining fragmented images and consuming ColPartitions of | |
| 871 // uncertain type. | |
| 872 // *part_ptr is an input/output BRT_RECTIMAGE ColPartition that is to be | |
| 873 // expanded to consume overlapping and nearby ColPartitions of uncertain type | |
| 874 // and other BRT_RECTIMAGE partitions, but NOT to be expanded beyond | |
| 875 // max_image_box. *part_ptr is NOT in the part_grid. | |
| 876 // rectsearch is already constructed on the part_grid, and is used for | |
| 877 // searching for overlapping and nearby ColPartitions. | |
| 878 // ExpandImageIntoParts is called iteratively until it returns false. Each | |
| 879 // time it absorbs the nearest non-contained candidate, and everything that | |
| 880 // is fully contained within part_ptr's bounding box. | |
| 881 // TODO(rays) what if it just eats everything inside max_image_box in one go? | |
| 882 static bool ExpandImageIntoParts(const TBOX &max_image_box, ColPartitionGridSearch *rectsearch, | |
| 883 ColPartitionGrid *part_grid, ColPartition **part_ptr) { | |
| 884 ColPartition *image_part = *part_ptr; | |
| 885 TBOX im_part_box = image_part->bounding_box(); | |
| 886 if (textord_tabfind_show_images > 1) { | |
| 887 tprintf("Searching for merge with image part:"); | |
| 888 im_part_box.print(); | |
| 889 tprintf("Text box="); | |
| 890 max_image_box.print(); | |
| 891 } | |
| 892 rectsearch->StartRectSearch(max_image_box); | |
| 893 ColPartition *part; | |
| 894 ColPartition *best_part = nullptr; | |
| 895 int best_dist = 0; | |
| 896 while ((part = rectsearch->NextRectSearch()) != nullptr) { | |
| 897 if (textord_tabfind_show_images > 1) { | |
| 898 tprintf("Considering merge with part:"); | |
| 899 part->Print(); | |
| 900 if (im_part_box.contains(part->bounding_box())) { | |
| 901 tprintf("Fully contained\n"); | |
| 902 } else if (!max_image_box.contains(part->bounding_box())) { | |
| 903 tprintf("Not within text box\n"); | |
| 904 } else if (part->flow() == BTFT_STRONG_CHAIN) { | |
| 905 tprintf("Too strong text\n"); | |
| 906 } else { | |
| 907 tprintf("Real candidate\n"); | |
| 908 } | |
| 909 } | |
| 910 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_TEXT_ON_IMAGE || | |
| 911 part->blob_type() == BRT_POLYIMAGE) { | |
| 912 continue; | |
| 913 } | |
| 914 TBOX box = part->bounding_box(); | |
| 915 if (max_image_box.contains(box) && part->blob_type() != BRT_NOISE) { | |
| 916 if (im_part_box.contains(box)) { | |
| 917 // Eat it completely. | |
| 918 rectsearch->RemoveBBox(); | |
| 919 DeletePartition(part); | |
| 920 continue; | |
| 921 } | |
| 922 int x_dist = std::max(0, box.x_gap(im_part_box)); | |
| 923 int y_dist = std::max(0, box.y_gap(im_part_box)); | |
| 924 int dist = x_dist * x_dist + y_dist * y_dist; | |
| 925 if (dist > box.area() || dist > im_part_box.area()) { | |
| 926 continue; // Not close enough. | |
| 927 } | |
| 928 if (best_part == nullptr || dist < best_dist) { | |
| 929 // We keep the nearest qualifier, which is not necessarily the nearest. | |
| 930 best_part = part; | |
| 931 best_dist = dist; | |
| 932 } | |
| 933 } | |
| 934 } | |
| 935 if (best_part != nullptr) { | |
| 936 // It needs expanding. We can do it without touching text. | |
| 937 TBOX box = best_part->bounding_box(); | |
| 938 if (textord_tabfind_show_images > 1) { | |
| 939 tprintf("Merging image part:"); | |
| 940 im_part_box.print(); | |
| 941 tprintf("with part:"); | |
| 942 box.print(); | |
| 943 } | |
| 944 im_part_box += box; | |
| 945 *part_ptr = ColPartition::FakePartition(im_part_box, PT_UNKNOWN, BRT_RECTIMAGE, BTFT_NONTEXT); | |
| 946 DeletePartition(image_part); | |
| 947 part_grid->RemoveBBox(best_part); | |
| 948 DeletePartition(best_part); | |
| 949 rectsearch->RepositionIterator(); | |
| 950 return true; | |
| 951 } | |
| 952 return false; | |
| 953 } | |
| 954 | |
| 955 // Helper function to compute the overlap area between the box and the | |
| 956 // given list of partitions. | |
| 957 static int IntersectArea(const TBOX &box, ColPartition_LIST *part_list) { | |
| 958 int intersect_area = 0; | |
| 959 ColPartition_IT part_it(part_list); | |
| 960 // Iterate the parts and subtract intersecting area. | |
| 961 for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) { | |
| 962 ColPartition *image_part = part_it.data(); | |
| 963 TBOX intersect = box.intersection(image_part->bounding_box()); | |
| 964 intersect_area += intersect.area(); | |
| 965 } | |
| 966 return intersect_area; | |
| 967 } | |
| 968 | |
| 969 // part_list is a set of ColPartitions representing a polygonal image, and | |
| 970 // im_box is the union of the bounding boxes of all the parts in part_list. | |
| 971 // Tests whether part is to be consumed by the polygonal image. | |
| 972 // Returns true if part is weak text and more than half of its area is | |
| 973 // intersected by parts from the part_list, and it is contained within im_box. | |
| 974 static bool TestWeakIntersectedPart(const TBOX &im_box, ColPartition_LIST *part_list, | |
| 975 ColPartition *part) { | |
| 976 if (part->flow() < BTFT_STRONG_CHAIN) { | |
| 977 // A weak partition intersects the box. | |
| 978 const TBOX &part_box = part->bounding_box(); | |
| 979 if (im_box.contains(part_box)) { | |
| 980 int area = part_box.area(); | |
| 981 int intersect_area = IntersectArea(part_box, part_list); | |
| 982 if (area < 2 * intersect_area) { | |
| 983 return true; | |
| 984 } | |
| 985 } | |
| 986 } | |
| 987 return false; | |
| 988 } | |
| 989 | |
| 990 // A rectangular or polygonal image has been completed, in part_list, bounding | |
| 991 // box in im_box. We want to eliminate weak text or other uncertain partitions | |
| 992 // (basically anything that is not BRT_STRONG_CHAIN or better) from both the | |
| 993 // part_grid and the big_parts list that are contained within im_box and | |
| 994 // overlapped enough by the possibly polygonal image. | |
| 995 static void EliminateWeakParts(const TBOX &im_box, ColPartitionGrid *part_grid, | |
| 996 ColPartition_LIST *big_parts, ColPartition_LIST *part_list) { | |
| 997 ColPartitionGridSearch rectsearch(part_grid); | |
| 998 ColPartition *part; | |
| 999 rectsearch.StartRectSearch(im_box); | |
| 1000 while ((part = rectsearch.NextRectSearch()) != nullptr) { | |
| 1001 if (TestWeakIntersectedPart(im_box, part_list, part)) { | |
| 1002 BlobRegionType type = part->blob_type(); | |
| 1003 if (type == BRT_POLYIMAGE || type == BRT_RECTIMAGE) { | |
| 1004 rectsearch.RemoveBBox(); | |
| 1005 DeletePartition(part); | |
| 1006 } else { | |
| 1007 // The part is mostly covered, so mark it. Non-image partitions are | |
| 1008 // kept hanging around to mark the image for pass2 | |
| 1009 part->set_flow(BTFT_NONTEXT); | |
| 1010 part->set_blob_type(BRT_NOISE); | |
| 1011 part->SetBlobTypes(); | |
| 1012 } | |
| 1013 } | |
| 1014 } | |
| 1015 ColPartition_IT big_it(big_parts); | |
| 1016 for (big_it.mark_cycle_pt(); !big_it.cycled_list(); big_it.forward()) { | |
| 1017 part = big_it.data(); | |
| 1018 if (TestWeakIntersectedPart(im_box, part_list, part)) { | |
| 1019 // Once marked, the blobs will be swept up by TidyBlobs. | |
| 1020 DeletePartition(big_it.extract()); | |
| 1021 } | |
| 1022 } | |
| 1023 } | |
| 1024 | |
| 1025 // Helper scans for good text partitions overlapping the given box. | |
| 1026 // If there are no good text partitions overlapping an expanded box, then | |
| 1027 // the box is expanded, otherwise, the original box is returned. | |
| 1028 // If good text overlaps the box, true is returned. | |
| 1029 static bool ScanForOverlappingText(ColPartitionGrid *part_grid, TBOX *box) { | |
| 1030 ColPartitionGridSearch rectsearch(part_grid); | |
| 1031 TBOX padded_box(*box); | |
| 1032 padded_box.pad(kNoisePadding, kNoisePadding); | |
| 1033 rectsearch.StartRectSearch(padded_box); | |
| 1034 ColPartition *part; | |
| 1035 bool any_text_in_padded_rect = false; | |
| 1036 while ((part = rectsearch.NextRectSearch()) != nullptr) { | |
| 1037 if (part->flow() == BTFT_CHAIN || part->flow() == BTFT_STRONG_CHAIN) { | |
| 1038 // Text intersects the box. | |
| 1039 any_text_in_padded_rect = true; | |
| 1040 const TBOX &part_box = part->bounding_box(); | |
| 1041 if (box->overlap(part_box)) { | |
| 1042 return true; | |
| 1043 } | |
| 1044 } | |
| 1045 } | |
| 1046 if (!any_text_in_padded_rect) { | |
| 1047 *box = padded_box; | |
| 1048 } | |
| 1049 return false; | |
| 1050 } | |
| 1051 | |
| 1052 // Renders the boxes of image parts from the supplied list onto the image_pix, | |
| 1053 // except where they interfere with existing strong text in the part_grid, | |
| 1054 // and then deletes them. | |
| 1055 // Box coordinates are rotated by rerotate to match the image. | |
| 1056 static void MarkAndDeleteImageParts(const FCOORD &rerotate, ColPartitionGrid *part_grid, | |
| 1057 ColPartition_LIST *image_parts, Image image_pix) { | |
| 1058 if (image_pix == nullptr) { | |
| 1059 return; | |
| 1060 } | |
| 1061 int imageheight = pixGetHeight(image_pix); | |
| 1062 ColPartition_IT part_it(image_parts); | |
| 1063 for (; !part_it.empty(); part_it.forward()) { | |
| 1064 ColPartition *part = part_it.extract(); | |
| 1065 TBOX part_box = part->bounding_box(); | |
| 1066 BlobRegionType type = part->blob_type(); | |
| 1067 if (!ScanForOverlappingText(part_grid, &part_box) || type == BRT_RECTIMAGE || | |
| 1068 type == BRT_POLYIMAGE) { | |
| 1069 // Mark the box on the image. | |
| 1070 // All coords need to be rotated to match the image. | |
| 1071 part_box.rotate(rerotate); | |
| 1072 int left = part_box.left(); | |
| 1073 int top = part_box.top(); | |
| 1074 pixRasterop(image_pix, left, imageheight - top, part_box.width(), part_box.height(), PIX_SET, | |
| 1075 nullptr, 0, 0); | |
| 1076 } | |
| 1077 DeletePartition(part); | |
| 1078 } | |
| 1079 } | |
| 1080 | |
| 1081 // Locates all the image partitions in the part_grid, that were found by a | |
| 1082 // previous call to FindImagePartitions, marks them in the image_mask, | |
| 1083 // removes them from the grid, and deletes them. This makes it possible to | |
| 1084 // call FindImagePartitions again to produce less broken-up and less | |
| 1085 // overlapping image partitions. | |
| 1086 // rerotation specifies how to rotate the partition coords to match | |
| 1087 // the image_mask, since this function is used after orientation correction. | |
| 1088 void ImageFind::TransferImagePartsToImageMask(const FCOORD &rerotation, ColPartitionGrid *part_grid, | |
| 1089 Image image_mask) { | |
| 1090 // Extract the noise parts from the grid and put them on a temporary list. | |
| 1091 ColPartition_LIST parts_list; | |
| 1092 ColPartition_IT part_it(&parts_list); | |
| 1093 ColPartitionGridSearch gsearch(part_grid); | |
| 1094 gsearch.StartFullSearch(); | |
| 1095 ColPartition *part; | |
| 1096 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 1097 BlobRegionType type = part->blob_type(); | |
| 1098 if (type == BRT_NOISE || type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) { | |
| 1099 part_it.add_after_then_move(part); | |
| 1100 gsearch.RemoveBBox(); | |
| 1101 } | |
| 1102 } | |
| 1103 // Render listed noise partitions to the image mask. | |
| 1104 MarkAndDeleteImageParts(rerotation, part_grid, &parts_list, image_mask); | |
| 1105 } | |
| 1106 | |
| 1107 // Removes and deletes all image partitions that are too small to be worth | |
| 1108 // keeping. We have to do this as a separate phase after creating the image | |
| 1109 // partitions as the small images are needed to join the larger ones together. | |
| 1110 static void DeleteSmallImages(ColPartitionGrid *part_grid) { | |
| 1111 if (part_grid != nullptr) { | |
| 1112 return; | |
| 1113 } | |
| 1114 ColPartitionGridSearch gsearch(part_grid); | |
| 1115 gsearch.StartFullSearch(); | |
| 1116 ColPartition *part; | |
| 1117 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 1118 // Only delete rectangular images, since if it became a poly image, it | |
| 1119 // is more evidence that it is somehow important. | |
| 1120 if (part->blob_type() == BRT_RECTIMAGE) { | |
| 1121 const TBOX &part_box = part->bounding_box(); | |
| 1122 if (part_box.width() < kMinImageFindSize || part_box.height() < kMinImageFindSize) { | |
| 1123 // It is too small to keep. Just make it disappear. | |
| 1124 gsearch.RemoveBBox(); | |
| 1125 DeletePartition(part); | |
| 1126 } | |
| 1127 } | |
| 1128 } | |
| 1129 } | |
| 1130 | |
| 1131 // Runs a CC analysis on the image_pix mask image, and creates | |
| 1132 // image partitions from them, cutting out strong text, and merging with | |
| 1133 // nearby image regions such that they don't interfere with text. | |
| 1134 // Rotation and rerotation specify how to rotate image coords to match | |
| 1135 // the blob and partition coords and back again. | |
| 1136 // The input/output part_grid owns all the created partitions, and | |
| 1137 // the partitions own all the fake blobs that belong in the partitions. | |
| 1138 // Since the other blobs in the other partitions will be owned by the block, | |
| 1139 // ColPartitionGrid::ReTypeBlobs must be called afterwards to fix this | |
| 1140 // situation and collect the image blobs. | |
| 1141 void ImageFind::FindImagePartitions(Image image_pix, const FCOORD &rotation, | |
| 1142 const FCOORD &rerotation, TO_BLOCK *block, TabFind *tab_grid, | |
| 1143 DebugPixa *pixa_debug, ColPartitionGrid *part_grid, | |
| 1144 ColPartition_LIST *big_parts) { | |
| 1145 int imageheight = pixGetHeight(image_pix); | |
| 1146 Boxa *boxa; | |
| 1147 Pixa *pixa; | |
| 1148 ConnCompAndRectangularize(image_pix, pixa_debug, &boxa, &pixa); | |
| 1149 // Iterate the connected components in the image regions mask. | |
| 1150 int nboxes = 0; | |
| 1151 if (boxa != nullptr && pixa != nullptr) { | |
| 1152 nboxes = boxaGetCount(boxa); | |
| 1153 } | |
| 1154 for (int i = 0; i < nboxes; ++i) { | |
| 1155 l_int32 x, y, width, height; | |
| 1156 boxaGetBoxGeometry(boxa, i, &x, &y, &width, &height); | |
| 1157 Image pix = pixaGetPix(pixa, i, L_CLONE); | |
| 1158 TBOX im_box(x, imageheight - y - height, x + width, imageheight - y); | |
| 1159 im_box.rotate(rotation); // Now matches all partitions and blobs. | |
| 1160 ColPartitionGridSearch rectsearch(part_grid); | |
| 1161 rectsearch.SetUniqueMode(true); | |
| 1162 ColPartition_LIST part_list; | |
| 1163 DivideImageIntoParts(im_box, rotation, rerotation, pix, &rectsearch, &part_list); | |
| 1164 if (textord_tabfind_show_images && pixa_debug != nullptr) { | |
| 1165 pixa_debug->AddPix(pix, "ImageComponent"); | |
| 1166 tprintf("Component has %d parts\n", part_list.length()); | |
| 1167 } | |
| 1168 pix.destroy(); | |
| 1169 if (!part_list.empty()) { | |
| 1170 ColPartition_IT part_it(&part_list); | |
| 1171 if (part_list.singleton()) { | |
| 1172 // We didn't have to chop it into a polygon to fit around text, so | |
| 1173 // try expanding it to merge fragmented image parts, as long as it | |
| 1174 // doesn't touch strong text. | |
| 1175 ColPartition *part = part_it.extract(); | |
| 1176 TBOX text_box(im_box); | |
| 1177 MaximalImageBoundingBox(part_grid, &text_box); | |
| 1178 while (ExpandImageIntoParts(text_box, &rectsearch, part_grid, &part)) { | |
| 1179 ; | |
| 1180 } | |
| 1181 part_it.set_to_list(&part_list); | |
| 1182 part_it.add_after_then_move(part); | |
| 1183 im_box = part->bounding_box(); | |
| 1184 } | |
| 1185 EliminateWeakParts(im_box, part_grid, big_parts, &part_list); | |
| 1186 // Iterate the part_list and put the parts into the grid. | |
| 1187 for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) { | |
| 1188 ColPartition *image_part = part_it.extract(); | |
| 1189 im_box = image_part->bounding_box(); | |
| 1190 part_grid->InsertBBox(true, true, image_part); | |
| 1191 if (!part_it.at_last()) { | |
| 1192 ColPartition *neighbour = part_it.data_relative(1); | |
| 1193 image_part->AddPartner(false, neighbour); | |
| 1194 neighbour->AddPartner(true, image_part); | |
| 1195 } | |
| 1196 } | |
| 1197 } | |
| 1198 } | |
| 1199 boxaDestroy(&boxa); | |
| 1200 pixaDestroy(&pixa); | |
| 1201 DeleteSmallImages(part_grid); | |
| 1202 #ifndef GRAPHICS_DISABLED | |
| 1203 if (textord_tabfind_show_images) { | |
| 1204 ScrollView *images_win_ = part_grid->MakeWindow(1000, 400, "With Images"); | |
| 1205 part_grid->DisplayBoxes(images_win_); | |
| 1206 } | |
| 1207 #endif | |
| 1208 } | |
| 1209 | |
| 1210 } // namespace tesseract. |
