Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/textlineprojection.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2011 Google Inc. All Rights Reserved. | |
| 2 // Author: rays@google.com (Ray Smith) | |
| 3 // | |
| 4 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 // you may not use this file except in compliance with the License. | |
| 6 // You may obtain a copy of the License at | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // Unless required by applicable law or agreed to in writing, software | |
| 9 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 11 // See the License for the specific language governing permissions and | |
| 12 // limitations under the License. | |
| 13 | |
| 14 #ifdef HAVE_CONFIG_H | |
| 15 # include "config_auto.h" | |
| 16 #endif | |
| 17 | |
| 18 #include <allheaders.h> | |
| 19 #include "bbgrid.h" // Base class. | |
| 20 #include "blobbox.h" // BlobNeighbourDir. | |
| 21 #include "blobs.h" | |
| 22 #include "colpartition.h" | |
| 23 #include "helpers.h" // for IntCastRounded | |
| 24 #include "normalis.h" | |
| 25 #include "textlineprojection.h" | |
| 26 | |
| 27 #include <algorithm> | |
| 28 | |
// Tuning constants for the textline projection. All values are unitless
// multipliers unless stated otherwise.

// Multiplier applied when padding blobs whose orientation is certain.
constexpr int kOrientedPadFactor = 8;
// Multiplier applied when padding blobs whose orientation is not certain.
constexpr int kDefaultPadFactor = 2;
// Cost of one projection step that moves away from the line center.
constexpr int kWrongWayPenalty = 4;
// Divisor applied to the parallel gap when it is combined with the
// perpendicular gap to measure how far a box is from a target box in curved
// textline space. Treating the parallel gap more favorably allows quotes and
// ellipses at the end of textlines to be caught.
constexpr int kParaPerpDistRatio = 4;
// The inter-line gap must be at least this multiple of scale_factor_ before
// the increment box is padded perpendicular to the text line.
constexpr int kMinLineSpacingFactor = 4;
// Largest tab-stop overrun allowed for horizontal padding, measured in
// projection pixels.
constexpr int kMaxTabStopOverrun = 6;
| 45 | |
| 46 namespace tesseract { | |
| 47 | |
| 48 TextlineProjection::TextlineProjection(int resolution) : x_origin_(0), y_origin_(0), pix_(nullptr) { | |
| 49 // The projection map should be about 100 ppi, whatever the input. | |
| 50 scale_factor_ = IntCastRounded(resolution / 100.0); | |
| 51 if (scale_factor_ < 1) { | |
| 52 scale_factor_ = 1; | |
| 53 } | |
| 54 } | |
| 55 TextlineProjection::~TextlineProjection() { | |
| 56 pix_.destroy(); | |
| 57 } | |
| 58 | |
| 59 // Build the projection profile given the input_block containing lists of | |
| 60 // blobs, a rotation to convert to image coords, | |
| 61 // and a full-resolution nontext_map, marking out areas to avoid. | |
| 62 // During construction, we have the following assumptions: | |
| 63 // The rotation is a multiple of 90 degrees, ie no deskew yet. | |
| 64 // The blobs have had their left and right rules set to also limit | |
| 65 // the range of projection. | |
| 66 void TextlineProjection::ConstructProjection(TO_BLOCK *input_block, const FCOORD &rotation, | |
| 67 Image nontext_map) { | |
| 68 pix_.destroy(); | |
| 69 TBOX image_box(0, 0, pixGetWidth(nontext_map), pixGetHeight(nontext_map)); | |
| 70 x_origin_ = 0; | |
| 71 y_origin_ = image_box.height(); | |
| 72 int width = (image_box.width() + scale_factor_ - 1) / scale_factor_; | |
| 73 int height = (image_box.height() + scale_factor_ - 1) / scale_factor_; | |
| 74 | |
| 75 pix_ = pixCreate(width, height, 8); | |
| 76 ProjectBlobs(&input_block->blobs, rotation, image_box, nontext_map); | |
| 77 ProjectBlobs(&input_block->large_blobs, rotation, image_box, nontext_map); | |
| 78 Image final_pix = pixBlockconv(pix_, 1, 1); | |
| 79 // Pix* final_pix = pixBlockconv(pix_, 2, 2); | |
| 80 pix_.destroy(); | |
| 81 pix_ = final_pix; | |
| 82 } | |
| 83 | |
| 84 #ifndef GRAPHICS_DISABLED | |
| 85 | |
| 86 // Display the blobs in the window colored according to textline quality. | |
| 87 void TextlineProjection::PlotGradedBlobs(BLOBNBOX_LIST *blobs, ScrollView *win) { | |
| 88 BLOBNBOX_IT it(blobs); | |
| 89 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 90 BLOBNBOX *blob = it.data(); | |
| 91 const TBOX &box = blob->bounding_box(); | |
| 92 bool bad_box = BoxOutOfHTextline(box, nullptr, false); | |
| 93 if (blob->UniquelyVertical()) { | |
| 94 win->Pen(ScrollView::YELLOW); | |
| 95 } else { | |
| 96 win->Pen(bad_box ? ScrollView::RED : ScrollView::BLUE); | |
| 97 } | |
| 98 win->Rectangle(box.left(), box.bottom(), box.right(), box.top()); | |
| 99 } | |
| 100 win->Update(); | |
| 101 } | |
| 102 | |
| 103 #endif // !GRAPHICS_DISABLED | |
| 104 | |
| 105 // Moves blobs that look like they don't sit well on a textline from the | |
| 106 // input blobs list to the output small_blobs list. | |
| 107 // This gets them away from initial textline finding to stop diacritics | |
| 108 // from forming incorrect textlines. (Introduced mainly to fix Thai.) | |
| 109 void TextlineProjection::MoveNonTextlineBlobs(BLOBNBOX_LIST *blobs, | |
| 110 BLOBNBOX_LIST *small_blobs) const { | |
| 111 BLOBNBOX_IT it(blobs); | |
| 112 BLOBNBOX_IT small_it(small_blobs); | |
| 113 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 114 BLOBNBOX *blob = it.data(); | |
| 115 const TBOX &box = blob->bounding_box(); | |
| 116 bool debug = AlignedBlob::WithinTestRegion(2, box.left(), box.bottom()); | |
| 117 if (BoxOutOfHTextline(box, nullptr, debug) && !blob->UniquelyVertical()) { | |
| 118 blob->ClearNeighbours(); | |
| 119 small_it.add_to_end(it.extract()); | |
| 120 } | |
| 121 } | |
| 122 } | |
| 123 | |
| 124 #ifndef GRAPHICS_DISABLED | |
| 125 | |
| 126 // Create a window and display the projection in it. | |
| 127 void TextlineProjection::DisplayProjection() const { | |
| 128 int width = pixGetWidth(pix_); | |
| 129 int height = pixGetHeight(pix_); | |
| 130 Image pixc = pixCreate(width, height, 32); | |
| 131 int src_wpl = pixGetWpl(pix_); | |
| 132 int col_wpl = pixGetWpl(pixc); | |
| 133 uint32_t *src_data = pixGetData(pix_); | |
| 134 uint32_t *col_data = pixGetData(pixc); | |
| 135 for (int y = 0; y < height; ++y, src_data += src_wpl, col_data += col_wpl) { | |
| 136 for (int x = 0; x < width; ++x) { | |
| 137 int pixel = GET_DATA_BYTE(src_data, x); | |
| 138 l_uint32 result; | |
| 139 if (pixel <= 17) { | |
| 140 composeRGBPixel(0, 0, pixel * 15, &result); | |
| 141 } else if (pixel <= 145) { | |
| 142 composeRGBPixel(0, (pixel - 17) * 2, 255, &result); | |
| 143 } else { | |
| 144 composeRGBPixel((pixel - 145) * 2, 255, 255, &result); | |
| 145 } | |
| 146 col_data[x] = result; | |
| 147 } | |
| 148 } | |
| 149 auto *win = new ScrollView("Projection", 0, 0, width, height, width, height); | |
| 150 win->Draw(pixc, 0, 0); | |
| 151 win->Update(); | |
| 152 pixc.destroy(); | |
| 153 } | |
| 154 | |
| 155 #endif // !GRAPHICS_DISABLED | |
| 156 | |
| 157 // Compute the distance of the box from the partition using curved projection | |
| 158 // space. As DistanceOfBoxFromBox, except that the direction is taken from | |
| 159 // the ColPartition and the median bounds of the ColPartition are used as | |
| 160 // the to_box. | |
| 161 int TextlineProjection::DistanceOfBoxFromPartition(const TBOX &box, const ColPartition &part, | |
| 162 const DENORM *denorm, bool debug) const { | |
| 163 // Compute a partition box that uses the median top/bottom of the blobs | |
| 164 // within and median left/right for vertical. | |
| 165 TBOX part_box = part.bounding_box(); | |
| 166 if (part.IsHorizontalType()) { | |
| 167 part_box.set_top(part.median_top()); | |
| 168 part_box.set_bottom(part.median_bottom()); | |
| 169 } else { | |
| 170 part_box.set_left(part.median_left()); | |
| 171 part_box.set_right(part.median_right()); | |
| 172 } | |
| 173 // Now use DistanceOfBoxFromBox to make the actual calculation. | |
| 174 return DistanceOfBoxFromBox(box, part_box, part.IsHorizontalType(), denorm, debug); | |
| 175 } | |
| 176 | |
| 177 // Compute the distance from the from_box to the to_box using curved | |
| 178 // projection space. Separation that involves a decrease in projection | |
| 179 // density (moving from the from_box to the to_box) is weighted more heavily | |
| 180 // than constant density, and an increase is weighted less. | |
| 181 // If horizontal_textline is true, then curved space is used vertically, | |
| 182 // as for a diacritic on the edge of a textline. | |
| 183 // The projection uses original image coords, so denorm is used to get | |
| 184 // back to the image coords from box/part space. | |
| 185 // How the calculation works: Think of a diacritic near a textline. | |
| 186 // Distance is measured from the far side of the from_box to the near side of | |
| 187 // the to_box. Shown is the horizontal textline case. | |
| 188 // |------^-----| | |
| 189 // | from | box | | |
| 190 // |------|-----| | |
| 191 // perpendicular | | |
| 192 // <------v-------->|--------------------| | |
| 193 // parallel | to box | | |
| 194 // |--------------------| | |
| 195 // Perpendicular distance uses "curved space" See VerticalDistance below. | |
| 196 // Parallel distance is linear. | |
| 197 // Result is perpendicular_gap + parallel_gap / kParaPerpDistRatio. | |
| 198 int TextlineProjection::DistanceOfBoxFromBox(const TBOX &from_box, const TBOX &to_box, | |
| 199 bool horizontal_textline, const DENORM *denorm, | |
| 200 bool debug) const { | |
| 201 // The parallel_gap is the horizontal gap between a horizontal textline and | |
| 202 // the box. Analogous for vertical. | |
| 203 int parallel_gap = 0; | |
| 204 // start_pt is the box end of the line to be modified for curved space. | |
| 205 TPOINT start_pt; | |
| 206 // end_pt is the partition end of the line to be modified for curved space. | |
| 207 TPOINT end_pt; | |
| 208 if (horizontal_textline) { | |
| 209 parallel_gap = from_box.x_gap(to_box) + from_box.width(); | |
| 210 start_pt.x = (from_box.left() + from_box.right()) / 2; | |
| 211 end_pt.x = start_pt.x; | |
| 212 if (from_box.top() - to_box.top() >= to_box.bottom() - from_box.bottom()) { | |
| 213 start_pt.y = from_box.top(); | |
| 214 end_pt.y = std::min(to_box.top(), start_pt.y); | |
| 215 } else { | |
| 216 start_pt.y = from_box.bottom(); | |
| 217 end_pt.y = std::max(to_box.bottom(), start_pt.y); | |
| 218 } | |
| 219 } else { | |
| 220 parallel_gap = from_box.y_gap(to_box) + from_box.height(); | |
| 221 if (from_box.right() - to_box.right() >= to_box.left() - from_box.left()) { | |
| 222 start_pt.x = from_box.right(); | |
| 223 end_pt.x = std::min(to_box.right(), start_pt.x); | |
| 224 } else { | |
| 225 start_pt.x = from_box.left(); | |
| 226 end_pt.x = std::max(to_box.left(), start_pt.x); | |
| 227 } | |
| 228 start_pt.y = (from_box.bottom() + from_box.top()) / 2; | |
| 229 end_pt.y = start_pt.y; | |
| 230 } | |
| 231 // The perpendicular gap is the max vertical distance gap out of: | |
| 232 // top of from_box to to_box top and bottom of from_box to to_box bottom. | |
| 233 // This value is then modified for curved projection space. | |
| 234 // Analogous for vertical. | |
| 235 int perpendicular_gap = 0; | |
| 236 // If start_pt == end_pt, then the from_box lies entirely within the to_box | |
| 237 // (in the perpendicular direction), so we don't need to calculate the | |
| 238 // perpendicular_gap. | |
| 239 if (start_pt.x != end_pt.x || start_pt.y != end_pt.y) { | |
| 240 if (denorm != nullptr) { | |
| 241 // Denormalize the start and end. | |
| 242 denorm->DenormTransform(nullptr, start_pt, &start_pt); | |
| 243 denorm->DenormTransform(nullptr, end_pt, &end_pt); | |
| 244 } | |
| 245 if (abs(start_pt.y - end_pt.y) >= abs(start_pt.x - end_pt.x)) { | |
| 246 perpendicular_gap = VerticalDistance(debug, start_pt.x, start_pt.y, end_pt.y); | |
| 247 } else { | |
| 248 perpendicular_gap = HorizontalDistance(debug, start_pt.x, end_pt.x, start_pt.y); | |
| 249 } | |
| 250 } | |
| 251 // The parallel_gap weighs less than the perpendicular_gap. | |
| 252 return perpendicular_gap + parallel_gap / kParaPerpDistRatio; | |
| 253 } | |
| 254 | |
| 255 // Compute the distance between (x, y1) and (x, y2) using the rule that | |
| 256 // a decrease in textline density is weighted more heavily than an increase. | |
| 257 // The coordinates are in source image space, ie processed by any denorm | |
| 258 // already, but not yet scaled by scale_factor_. | |
| 259 // Going from the outside of a textline to the inside should measure much | |
| 260 // less distance than going from the inside of a textline to the outside. | |
| 261 // How it works: | |
| 262 // An increase is cheap (getting closer to a textline). | |
| 263 // Constant costs unity. | |
| 264 // A decrease is expensive (getting further from a textline). | |
| 265 // Pixels in projection map Counted distance | |
| 266 // 2 | |
| 267 // 3 1/x | |
| 268 // 3 1 | |
| 269 // 2 x | |
| 270 // 5 1/x | |
| 271 // 7 1/x | |
| 272 // Total: 1 + x + 3/x where x = kWrongWayPenalty. | |
| 273 int TextlineProjection::VerticalDistance(bool debug, int x, int y1, int y2) const { | |
| 274 x = ImageXToProjectionX(x); | |
| 275 y1 = ImageYToProjectionY(y1); | |
| 276 y2 = ImageYToProjectionY(y2); | |
| 277 if (y1 == y2) { | |
| 278 return 0; | |
| 279 } | |
| 280 int wpl = pixGetWpl(pix_); | |
| 281 int step = y1 < y2 ? 1 : -1; | |
| 282 uint32_t *data = pixGetData(pix_) + y1 * wpl; | |
| 283 wpl *= step; | |
| 284 int prev_pixel = GET_DATA_BYTE(data, x); | |
| 285 int distance = 0; | |
| 286 int right_way_steps = 0; | |
| 287 for (int y = y1; y != y2; y += step) { | |
| 288 data += wpl; | |
| 289 int pixel = GET_DATA_BYTE(data, x); | |
| 290 if (debug) { | |
| 291 tprintf("At (%d,%d), pix = %d, prev=%d\n", x, y + step, pixel, prev_pixel); | |
| 292 } | |
| 293 if (pixel < prev_pixel) { | |
| 294 distance += kWrongWayPenalty; | |
| 295 } else if (pixel > prev_pixel) { | |
| 296 ++right_way_steps; | |
| 297 } else { | |
| 298 ++distance; | |
| 299 } | |
| 300 prev_pixel = pixel; | |
| 301 } | |
| 302 return distance * scale_factor_ + right_way_steps * scale_factor_ / kWrongWayPenalty; | |
| 303 } | |
| 304 | |
| 305 // Compute the distance between (x1, y) and (x2, y) using the rule that | |
| 306 // a decrease in textline density is weighted more heavily than an increase. | |
| 307 int TextlineProjection::HorizontalDistance(bool debug, int x1, int x2, int y) const { | |
| 308 x1 = ImageXToProjectionX(x1); | |
| 309 x2 = ImageXToProjectionX(x2); | |
| 310 y = ImageYToProjectionY(y); | |
| 311 if (x1 == x2) { | |
| 312 return 0; | |
| 313 } | |
| 314 int wpl = pixGetWpl(pix_); | |
| 315 int step = x1 < x2 ? 1 : -1; | |
| 316 uint32_t *data = pixGetData(pix_) + y * wpl; | |
| 317 int prev_pixel = GET_DATA_BYTE(data, x1); | |
| 318 int distance = 0; | |
| 319 int right_way_steps = 0; | |
| 320 for (int x = x1; x != x2; x += step) { | |
| 321 int pixel = GET_DATA_BYTE(data, x + step); | |
| 322 if (debug) { | |
| 323 tprintf("At (%d,%d), pix = %d, prev=%d\n", x + step, y, pixel, prev_pixel); | |
| 324 } | |
| 325 if (pixel < prev_pixel) { | |
| 326 distance += kWrongWayPenalty; | |
| 327 } else if (pixel > prev_pixel) { | |
| 328 ++right_way_steps; | |
| 329 } else { | |
| 330 ++distance; | |
| 331 } | |
| 332 prev_pixel = pixel; | |
| 333 } | |
| 334 return distance * scale_factor_ + right_way_steps * scale_factor_ / kWrongWayPenalty; | |
| 335 } | |
| 336 | |
| 337 // Returns true if the blob appears to be outside of a textline. | |
| 338 // Such blobs are potentially diacritics (even if large in Thai) and should | |
| 339 // be kept away from initial textline finding. | |
| 340 bool TextlineProjection::BoxOutOfHTextline(const TBOX &box, const DENORM *denorm, | |
| 341 bool debug) const { | |
| 342 int grad1 = 0; | |
| 343 int grad2 = 0; | |
| 344 EvaluateBoxInternal(box, denorm, debug, &grad1, &grad2, nullptr, nullptr); | |
| 345 int worst_result = std::min(grad1, grad2); | |
| 346 int total_result = grad1 + grad2; | |
| 347 if (total_result >= 6) { | |
| 348 return false; // Strongly in textline. | |
| 349 } | |
| 350 // Medium strength: if either gradient is negative, it is likely outside | |
| 351 // the body of the textline. | |
| 352 if (worst_result < 0) { | |
| 353 return true; | |
| 354 } | |
| 355 return false; | |
| 356 } | |
| 357 | |
| 358 // Evaluates the textlineiness of a ColPartition. Uses EvaluateBox below, | |
| 359 // but uses the median top/bottom for horizontal and median left/right for | |
| 360 // vertical instead of the bounding box edges. | |
| 361 // Evaluates for both horizontal and vertical and returns the best result, | |
| 362 // with a positive value for horizontal and a negative value for vertical. | |
| 363 int TextlineProjection::EvaluateColPartition(const ColPartition &part, const DENORM *denorm, | |
| 364 bool debug) const { | |
| 365 if (part.IsSingleton()) { | |
| 366 return EvaluateBox(part.bounding_box(), denorm, debug); | |
| 367 } | |
| 368 // Test vertical orientation. | |
| 369 TBOX box = part.bounding_box(); | |
| 370 // Use the partition median for left/right. | |
| 371 box.set_left(part.median_left()); | |
| 372 box.set_right(part.median_right()); | |
| 373 int vresult = EvaluateBox(box, denorm, debug); | |
| 374 | |
| 375 // Test horizontal orientation. | |
| 376 box = part.bounding_box(); | |
| 377 // Use the partition median for top/bottom. | |
| 378 box.set_top(part.median_top()); | |
| 379 box.set_bottom(part.median_bottom()); | |
| 380 int hresult = EvaluateBox(box, denorm, debug); | |
| 381 if (debug) { | |
| 382 tprintf("Partition hresult=%d, vresult=%d from:", hresult, vresult); | |
| 383 part.bounding_box().print(); | |
| 384 part.Print(); | |
| 385 } | |
| 386 return hresult >= -vresult ? hresult : vresult; | |
| 387 } | |
| 388 | |
| 389 // Computes the mean projection gradients over the horizontal and vertical | |
| 390 // edges of the box: | |
| 391 // -h-h-h-h-h-h | |
| 392 // |------------| mean=htop -v|+v--------+v|-v | |
| 393 // |+h+h+h+h+h+h| -v|+v +v|-v | |
| 394 // | | -v|+v +v|-v | |
| 395 // | box | -v|+v box +v|-v | |
| 396 // | | -v|+v +v|-v | |
| 397 // |+h+h+h+h+h+h| -v|+v +v|-v | |
| 398 // |------------| mean=hbot -v|+v--------+v|-v | |
| 399 // -h-h-h-h-h-h | |
| 400 // mean=vleft mean=vright | |
| 401 // | |
| 402 // Returns MAX(htop,hbot) - MAX(vleft,vright), which is a positive number | |
| 403 // for a horizontal textline, a negative number for a vertical textline, | |
| 404 // and near zero for undecided. Undecided is most likely non-text. | |
| 405 // All the gradients are truncated to remain non-negative, since negative | |
| 406 // horizontal gradients don't give any indication of being vertical and | |
| 407 // vice versa. | |
| 408 // Additional complexity: The coordinates have to be transformed to original | |
| 409 // image coordinates with denorm (if not null), scaled to match the projection | |
| 410 // pix, and THEN step out 2 pixels each way from the edge to compute the | |
| 411 // gradient, and tries 3 positions, each measuring the gradient over a | |
| 412 // 4-pixel spread: (+3/-1), (+2/-2), (+1/-3). This complexity is handled by | |
| 413 // several layers of helpers below. | |
| 414 int TextlineProjection::EvaluateBox(const TBOX &box, const DENORM *denorm, bool debug) const { | |
| 415 return EvaluateBoxInternal(box, denorm, debug, nullptr, nullptr, nullptr, nullptr); | |
| 416 } | |
| 417 | |
| 418 // Internal version of EvaluateBox returns the unclipped gradients as well | |
| 419 // as the result of EvaluateBox. | |
| 420 // hgrad1 and hgrad2 are the gradients for the horizontal textline. | |
| 421 int TextlineProjection::EvaluateBoxInternal(const TBOX &box, const DENORM *denorm, bool debug, | |
| 422 int *hgrad1, int *hgrad2, int *vgrad1, | |
| 423 int *vgrad2) const { | |
| 424 int top_gradient = BestMeanGradientInRow(denorm, box.left(), box.right(), box.top(), true); | |
| 425 int bottom_gradient = | |
| 426 -BestMeanGradientInRow(denorm, box.left(), box.right(), box.bottom(), false); | |
| 427 int left_gradient = BestMeanGradientInColumn(denorm, box.left(), box.bottom(), box.top(), true); | |
| 428 int right_gradient = | |
| 429 -BestMeanGradientInColumn(denorm, box.right(), box.bottom(), box.top(), false); | |
| 430 int top_clipped = std::max(top_gradient, 0); | |
| 431 int bottom_clipped = std::max(bottom_gradient, 0); | |
| 432 int left_clipped = std::max(left_gradient, 0); | |
| 433 int right_clipped = std::max(right_gradient, 0); | |
| 434 if (debug) { | |
| 435 tprintf("Gradients: top = %d, bottom = %d, left= %d, right= %d for box:", top_gradient, | |
| 436 bottom_gradient, left_gradient, right_gradient); | |
| 437 box.print(); | |
| 438 } | |
| 439 int result = std::max(top_clipped, bottom_clipped) - std::max(left_clipped, right_clipped); | |
| 440 if (hgrad1 != nullptr && hgrad2 != nullptr) { | |
| 441 *hgrad1 = top_gradient; | |
| 442 *hgrad2 = bottom_gradient; | |
| 443 } | |
| 444 if (vgrad1 != nullptr && vgrad2 != nullptr) { | |
| 445 *vgrad1 = left_gradient; | |
| 446 *vgrad2 = right_gradient; | |
| 447 } | |
| 448 return result; | |
| 449 } | |
| 450 | |
| 451 // Helper returns the mean gradient value for the horizontal row at the given | |
| 452 // y, (in the external coordinates) by subtracting the mean of the transformed | |
| 453 // row 2 pixels above from the mean of the transformed row 2 pixels below. | |
| 454 // This gives a positive value for a good top edge and negative for bottom. | |
| 455 // Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge. | |
| 456 int TextlineProjection::BestMeanGradientInRow(const DENORM *denorm, int16_t min_x, int16_t max_x, | |
| 457 int16_t y, bool best_is_max) const { | |
| 458 TPOINT start_pt(min_x, y); | |
| 459 TPOINT end_pt(max_x, y); | |
| 460 int upper = MeanPixelsInLineSegment(denorm, -2, start_pt, end_pt); | |
| 461 int lower = MeanPixelsInLineSegment(denorm, 2, start_pt, end_pt); | |
| 462 int best_gradient = lower - upper; | |
| 463 upper = MeanPixelsInLineSegment(denorm, -1, start_pt, end_pt); | |
| 464 lower = MeanPixelsInLineSegment(denorm, 3, start_pt, end_pt); | |
| 465 int gradient = lower - upper; | |
| 466 if ((gradient > best_gradient) == best_is_max) { | |
| 467 best_gradient = gradient; | |
| 468 } | |
| 469 upper = MeanPixelsInLineSegment(denorm, -3, start_pt, end_pt); | |
| 470 lower = MeanPixelsInLineSegment(denorm, 1, start_pt, end_pt); | |
| 471 gradient = lower - upper; | |
| 472 if ((gradient > best_gradient) == best_is_max) { | |
| 473 best_gradient = gradient; | |
| 474 } | |
| 475 return best_gradient; | |
| 476 } | |
| 477 | |
| 478 // Helper returns the mean gradient value for the vertical column at the | |
| 479 // given x, (in the external coordinates) by subtracting the mean of the | |
| 480 // transformed column 2 pixels left from the mean of the transformed column | |
| 481 // 2 pixels to the right. | |
| 482 // This gives a positive value for a good left edge and negative for right. | |
| 483 // Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge. | |
| 484 int TextlineProjection::BestMeanGradientInColumn(const DENORM *denorm, int16_t x, int16_t min_y, | |
| 485 int16_t max_y, bool best_is_max) const { | |
| 486 TPOINT start_pt(x, min_y); | |
| 487 TPOINT end_pt(x, max_y); | |
| 488 int left = MeanPixelsInLineSegment(denorm, -2, start_pt, end_pt); | |
| 489 int right = MeanPixelsInLineSegment(denorm, 2, start_pt, end_pt); | |
| 490 int best_gradient = right - left; | |
| 491 left = MeanPixelsInLineSegment(denorm, -1, start_pt, end_pt); | |
| 492 right = MeanPixelsInLineSegment(denorm, 3, start_pt, end_pt); | |
| 493 int gradient = right - left; | |
| 494 if ((gradient > best_gradient) == best_is_max) { | |
| 495 best_gradient = gradient; | |
| 496 } | |
| 497 left = MeanPixelsInLineSegment(denorm, -3, start_pt, end_pt); | |
| 498 right = MeanPixelsInLineSegment(denorm, 1, start_pt, end_pt); | |
| 499 gradient = right - left; | |
| 500 if ((gradient > best_gradient) == best_is_max) { | |
| 501 best_gradient = gradient; | |
| 502 } | |
| 503 return best_gradient; | |
| 504 } | |
| 505 | |
| 506 // Helper returns the mean pixel value over the line between the start_pt and | |
| 507 // end_pt (inclusive), but shifted perpendicular to the line in the projection | |
| 508 // image by offset pixels. For simplicity, it is assumed that the vector is | |
| 509 // either nearly horizontal or nearly vertical. It works on skewed textlines! | |
| 510 // The end points are in external coordinates, and will be denormalized with | |
| 511 // the denorm if not nullptr before further conversion to pix coordinates. | |
| 512 // After all the conversions, the offset is added to the direction | |
| 513 // perpendicular to the line direction. The offset is thus in projection image | |
| 514 // coordinates, which allows the caller to get a guaranteed displacement | |
| 515 // between pixels used to calculate gradients. | |
| 516 int TextlineProjection::MeanPixelsInLineSegment(const DENORM *denorm, int offset, TPOINT start_pt, | |
| 517 TPOINT end_pt) const { | |
| 518 TransformToPixCoords(denorm, &start_pt); | |
| 519 TransformToPixCoords(denorm, &end_pt); | |
| 520 TruncateToImageBounds(&start_pt); | |
| 521 TruncateToImageBounds(&end_pt); | |
| 522 int wpl = pixGetWpl(pix_); | |
| 523 uint32_t *data = pixGetData(pix_); | |
| 524 int total = 0; | |
| 525 int count = 0; | |
| 526 int x_delta = end_pt.x - start_pt.x; | |
| 527 int y_delta = end_pt.y - start_pt.y; | |
| 528 if (abs(x_delta) >= abs(y_delta)) { | |
| 529 if (x_delta == 0) { | |
| 530 return 0; | |
| 531 } | |
| 532 // Horizontal line. Add the offset vertically. | |
| 533 int x_step = x_delta > 0 ? 1 : -1; | |
| 534 // Correct offset for rotation, keeping it anti-clockwise of the delta. | |
| 535 offset *= x_step; | |
| 536 start_pt.y += offset; | |
| 537 end_pt.y += offset; | |
| 538 TruncateToImageBounds(&start_pt); | |
| 539 TruncateToImageBounds(&end_pt); | |
| 540 x_delta = end_pt.x - start_pt.x; | |
| 541 y_delta = end_pt.y - start_pt.y; | |
| 542 count = x_delta * x_step + 1; | |
| 543 for (int x = start_pt.x; x != end_pt.x; x += x_step) { | |
| 544 int y = start_pt.y + DivRounded(y_delta * (x - start_pt.x), x_delta); | |
| 545 total += GET_DATA_BYTE(data + wpl * y, x); | |
| 546 } | |
| 547 } else { | |
| 548 // Vertical line. Add the offset horizontally. | |
| 549 int y_step = y_delta > 0 ? 1 : -1; | |
| 550 // Correct offset for rotation, keeping it anti-clockwise of the delta. | |
| 551 // Pix holds the image with y=0 at the top, so the offset is negated. | |
| 552 offset *= -y_step; | |
| 553 start_pt.x += offset; | |
| 554 end_pt.x += offset; | |
| 555 TruncateToImageBounds(&start_pt); | |
| 556 TruncateToImageBounds(&end_pt); | |
| 557 x_delta = end_pt.x - start_pt.x; | |
| 558 y_delta = end_pt.y - start_pt.y; | |
| 559 count = y_delta * y_step + 1; | |
| 560 for (int y = start_pt.y; y != end_pt.y; y += y_step) { | |
| 561 int x = start_pt.x + DivRounded(x_delta * (y - start_pt.y), y_delta); | |
| 562 total += GET_DATA_BYTE(data + wpl * y, x); | |
| 563 } | |
| 564 } | |
| 565 return DivRounded(total, count); | |
| 566 } | |
| 567 | |
| 568 // Given an input pix, and a box, the sides of the box are shrunk inwards until | |
| 569 // they bound any black pixels found within the original box. | |
| 570 // The function converts between tesseract coords and the pix coords assuming | |
| 571 // that this pix is full resolution equal in size to the original image. | |
| 572 // Returns an empty box if there are no black pixels in the source box. | |
| 573 static TBOX BoundsWithinBox(Image pix, const TBOX &box) { | |
| 574 int im_height = pixGetHeight(pix); | |
| 575 Box *input_box = boxCreate(box.left(), im_height - box.top(), box.width(), box.height()); | |
| 576 Box *output_box = nullptr; | |
| 577 pixClipBoxToForeground(pix, input_box, nullptr, &output_box); | |
| 578 TBOX result_box; | |
| 579 if (output_box != nullptr) { | |
| 580 l_int32 x, y, width, height; | |
| 581 boxGetGeometry(output_box, &x, &y, &width, &height); | |
| 582 result_box.set_left(x); | |
| 583 result_box.set_right(x + width); | |
| 584 result_box.set_top(im_height - y); | |
| 585 result_box.set_bottom(result_box.top() - height); | |
| 586 boxDestroy(&output_box); | |
| 587 } | |
| 588 boxDestroy(&input_box); | |
| 589 return result_box; | |
| 590 } | |
| 591 | |
| 592 // Splits the given box in half at x_middle or y_middle according to split_on_x | |
| 593 // and checks for nontext_map pixels in each half. Reduces the bbox so that it | |
| 594 // still includes the middle point, but does not touch any fg pixels in | |
| 595 // nontext_map. An empty box may be returned if there is no such box. | |
| 596 static void TruncateBoxToMissNonText(int x_middle, int y_middle, bool split_on_x, Image nontext_map, | |
| 597 TBOX *bbox) { | |
| 598 TBOX box1(*bbox); | |
| 599 TBOX box2(*bbox); | |
| 600 TBOX im_box; | |
| 601 if (split_on_x) { | |
| 602 box1.set_right(x_middle); | |
| 603 im_box = BoundsWithinBox(nontext_map, box1); | |
| 604 if (!im_box.null_box()) { | |
| 605 box1.set_left(im_box.right()); | |
| 606 } | |
| 607 box2.set_left(x_middle); | |
| 608 im_box = BoundsWithinBox(nontext_map, box2); | |
| 609 if (!im_box.null_box()) { | |
| 610 box2.set_right(im_box.left()); | |
| 611 } | |
| 612 } else { | |
| 613 box1.set_bottom(y_middle); | |
| 614 im_box = BoundsWithinBox(nontext_map, box1); | |
| 615 if (!im_box.null_box()) { | |
| 616 box1.set_top(im_box.bottom()); | |
| 617 } | |
| 618 box2.set_top(y_middle); | |
| 619 im_box = BoundsWithinBox(nontext_map, box2); | |
| 620 if (!im_box.null_box()) { | |
| 621 box2.set_bottom(im_box.top()); | |
| 622 } | |
| 623 } | |
| 624 box1 += box2; | |
| 625 *bbox = box1; | |
| 626 } | |
| 627 | |
| 628 // Helper function to add 1 to a rectangle in source image coords to the | |
| 629 // internal projection pix_. | |
| 630 void TextlineProjection::IncrementRectangle8Bit(const TBOX &box) { | |
| 631 int scaled_left = ImageXToProjectionX(box.left()); | |
| 632 int scaled_top = ImageYToProjectionY(box.top()); | |
| 633 int scaled_right = ImageXToProjectionX(box.right()); | |
| 634 int scaled_bottom = ImageYToProjectionY(box.bottom()); | |
| 635 int wpl = pixGetWpl(pix_); | |
| 636 uint32_t *data = pixGetData(pix_) + scaled_top * wpl; | |
| 637 for (int y = scaled_top; y <= scaled_bottom; ++y) { | |
| 638 for (int x = scaled_left; x <= scaled_right; ++x) { | |
| 639 int pixel = GET_DATA_BYTE(data, x); | |
| 640 if (pixel < 255) { | |
| 641 SET_DATA_BYTE(data, x, pixel + 1); | |
| 642 } | |
| 643 } | |
| 644 data += wpl; | |
| 645 } | |
| 646 } | |
| 647 | |
| 648 // Inserts a list of blobs into the projection. | |
| 649 // Rotation is a multiple of 90 degrees to get from blob coords to | |
| 650 // nontext_map coords, nontext_map_box is the bounds of the nontext_map. | |
| 651 // Blobs are spread horizontally or vertically according to their internal | |
| 652 // flags, but the spreading is truncated by set pixels in the nontext_map | |
| 653 // and also by the horizontal rule line limits on the blobs. | |
| 654 void TextlineProjection::ProjectBlobs(BLOBNBOX_LIST *blobs, const FCOORD &rotation, | |
| 655 const TBOX &nontext_map_box, Image nontext_map) { | |
| 656 BLOBNBOX_IT blob_it(blobs); | |
| 657 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 658 BLOBNBOX *blob = blob_it.data(); | |
| 659 TBOX bbox = blob->bounding_box(); | |
| 660 ICOORD middle((bbox.left() + bbox.right()) / 2, (bbox.bottom() + bbox.top()) / 2); | |
| 661 bool spreading_horizontally = PadBlobBox(blob, &bbox); | |
| 662 // Rotate to match the nontext_map. | |
| 663 bbox.rotate(rotation); | |
| 664 middle.rotate(rotation); | |
| 665 if (rotation.x() == 0.0f) { | |
| 666 spreading_horizontally = !spreading_horizontally; | |
| 667 } | |
| 668 // Clip to the image before applying the increments. | |
| 669 bbox &= nontext_map_box; // This is in-place box intersection. | |
| 670 // Check for image pixels before spreading. | |
| 671 TruncateBoxToMissNonText(middle.x(), middle.y(), spreading_horizontally, nontext_map, &bbox); | |
| 672 if (bbox.area() > 0) { | |
| 673 IncrementRectangle8Bit(bbox); | |
| 674 } | |
| 675 } | |
| 676 } | |
| 677 | |
| 678 // Pads the bounding box of the given blob according to whether it is on | |
| 679 // a horizontal or vertical text line, taking into account tab-stops near | |
| 680 // the blob. Returns true if padding was in the horizontal direction. | |
| 681 bool TextlineProjection::PadBlobBox(BLOBNBOX *blob, TBOX *bbox) { | |
| 682 // Determine which direction to spread. | |
| 683 // If text is well spaced out, it can be useful to pad perpendicular to | |
| 684 // the textline direction, so as to ensure diacritics get absorbed | |
| 685 // correctly, but if the text is tightly spaced, this will destroy the | |
| 686 // blank space between textlines in the projection map, and that would | |
| 687 // be very bad. | |
| 688 int pad_limit = scale_factor_ * kMinLineSpacingFactor; | |
| 689 int xpad = 0; | |
| 690 int ypad = 0; | |
| 691 bool padding_horizontally = false; | |
| 692 if (blob->UniquelyHorizontal()) { | |
| 693 xpad = bbox->height() * kOrientedPadFactor; | |
| 694 padding_horizontally = true; | |
| 695 // If the text appears to be very well spaced, pad the other direction by a | |
| 696 // single pixel in the projection profile space to help join diacritics to | |
| 697 // the textline. | |
| 698 if ((blob->neighbour(BND_ABOVE) == nullptr || | |
| 699 bbox->y_gap(blob->neighbour(BND_ABOVE)->bounding_box()) > pad_limit) && | |
| 700 (blob->neighbour(BND_BELOW) == nullptr || | |
| 701 bbox->y_gap(blob->neighbour(BND_BELOW)->bounding_box()) > pad_limit)) { | |
| 702 ypad = scale_factor_; | |
| 703 } | |
| 704 } else if (blob->UniquelyVertical()) { | |
| 705 ypad = bbox->width() * kOrientedPadFactor; | |
| 706 if ((blob->neighbour(BND_LEFT) == nullptr || | |
| 707 bbox->x_gap(blob->neighbour(BND_LEFT)->bounding_box()) > pad_limit) && | |
| 708 (blob->neighbour(BND_RIGHT) == nullptr || | |
| 709 bbox->x_gap(blob->neighbour(BND_RIGHT)->bounding_box()) > pad_limit)) { | |
| 710 xpad = scale_factor_; | |
| 711 } | |
| 712 } else { | |
| 713 if ((blob->neighbour(BND_ABOVE) != nullptr && | |
| 714 blob->neighbour(BND_ABOVE)->neighbour(BND_BELOW) == blob) || | |
| 715 (blob->neighbour(BND_BELOW) != nullptr && | |
| 716 blob->neighbour(BND_BELOW)->neighbour(BND_ABOVE) == blob)) { | |
| 717 ypad = bbox->width() * kDefaultPadFactor; | |
| 718 } | |
| 719 if ((blob->neighbour(BND_RIGHT) != nullptr && | |
| 720 blob->neighbour(BND_RIGHT)->neighbour(BND_LEFT) == blob) || | |
| 721 (blob->neighbour(BND_LEFT) != nullptr && | |
| 722 blob->neighbour(BND_LEFT)->neighbour(BND_RIGHT) == blob)) { | |
| 723 xpad = bbox->height() * kDefaultPadFactor; | |
| 724 padding_horizontally = true; | |
| 725 } | |
| 726 } | |
| 727 bbox->pad(xpad, ypad); | |
| 728 pad_limit = scale_factor_ * kMaxTabStopOverrun; | |
| 729 // Now shrink horizontally to avoid stepping more than pad_limit over a | |
| 730 // tab-stop. | |
| 731 if (bbox->left() < blob->left_rule() - pad_limit) { | |
| 732 bbox->set_left(blob->left_rule() - pad_limit); | |
| 733 } | |
| 734 if (bbox->right() > blob->right_rule() + pad_limit) { | |
| 735 bbox->set_right(blob->right_rule() + pad_limit); | |
| 736 } | |
| 737 return padding_horizontally; | |
| 738 } | |
| 739 | |
| 740 // Helper denormalizes the TPOINT with the denorm if not nullptr, then | |
| 741 // converts to pix_ coordinates. | |
| 742 void TextlineProjection::TransformToPixCoords(const DENORM *denorm, TPOINT *pt) const { | |
| 743 if (denorm != nullptr) { | |
| 744 // Denormalize the point. | |
| 745 denorm->DenormTransform(nullptr, *pt, pt); | |
| 746 } | |
| 747 pt->x = ImageXToProjectionX(pt->x); | |
| 748 pt->y = ImageYToProjectionY(pt->y); | |
| 749 } | |
| 750 | |
| 751 #if defined(_MSC_VER) && !defined(__clang__) | |
| 752 # pragma optimize("g", off) | |
| 753 #endif // _MSC_VER | |
| 754 // Helper truncates the TPOINT to be within the pix_. | |
| 755 void TextlineProjection::TruncateToImageBounds(TPOINT *pt) const { | |
| 756 pt->x = ClipToRange<int>(pt->x, 0, pixGetWidth(pix_) - 1); | |
| 757 pt->y = ClipToRange<int>(pt->y, 0, pixGetHeight(pix_) - 1); | |
| 758 } | |
| 759 #if defined(_MSC_VER) && !defined(__clang__) | |
| 760 # pragma optimize("", on) | |
| 761 #endif // _MSC_VER | |
| 762 | |
| 763 // Transform tesseract image coordinates to coordinates used in the projection. | |
| 764 int TextlineProjection::ImageXToProjectionX(int x) const { | |
| 765 x = ClipToRange((x - x_origin_) / scale_factor_, 0, pixGetWidth(pix_) - 1); | |
| 766 return x; | |
| 767 } | |
| 768 int TextlineProjection::ImageYToProjectionY(int y) const { | |
| 769 y = ClipToRange((y_origin_ - y) / scale_factor_, 0, pixGetHeight(pix_) - 1); | |
| 770 return y; | |
| 771 } | |
| 772 | |
| 773 } // namespace tesseract. |
