Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/tordmain.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: tordmain.cpp (Formerly textordp.c) | |
| 3 * Description: C++ top level textord code. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 1992, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #define _USE_MATH_DEFINES // for M_PI | |
| 20 | |
| 21 #ifdef HAVE_CONFIG_H | |
| 22 # include "config_auto.h" | |
| 23 #endif | |
| 24 | |
| 25 #include "tordmain.h" | |
| 26 | |
| 27 #include "arrayaccess.h" // for GET_DATA_BYTE | |
| 28 #include "blobbox.h" // for BLOBNBOX_IT, BLOBNBOX, TO_BLOCK, TO_B... | |
| 29 #include "ccstruct.h" // for CCStruct, CCStruct::kXHeightFraction | |
| 30 #include "clst.h" // for CLISTIZE | |
| 31 #include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST, C_OUTLINE | |
| 32 #include "drawtord.h" // for plot_box_list, to_win, create_to_win | |
| 33 #include "edgblob.h" // for extract_edges | |
| 34 #include "errcode.h" // for ASSERT_HOST, ... | |
| 35 #include "makerow.h" // for textord_test_x, textord_test_y, texto... | |
| 36 #include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only) | |
| 37 #include "ocrrow.h" // for ROW, ROW_IT, ROW_LIST, tweak_row_base... | |
| 38 #include "params.h" // for DoubleParam, BoolParam, IntParam | |
| 39 #include "pdblock.h" // for PDBLK | |
| 40 #include "points.h" // for FCOORD, ICOORD | |
| 41 #include "polyblk.h" // for POLY_BLOCK | |
| 42 #include "quadratc.h" // for QUAD_COEFFS | |
| 43 #include "quspline.h" // for QSPLINE, tweak_row_baseline | |
| 44 #include "rect.h" // for TBOX | |
| 45 #include "scrollview.h" // for ScrollView, ScrollView::WHITE | |
| 46 #include "statistc.h" // for STATS | |
| 47 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST | |
| 48 #include "textord.h" // for Textord, WordWithBox, WordGrid, WordS... | |
| 49 #include "tprintf.h" // for tprintf | |
| 50 #include "werd.h" // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP | |
| 51 | |
| 52 #include <allheaders.h> // for pixDestroy, pixGetHeight, boxCreate | |
| 53 | |
| 54 #include <cfloat> // for FLT_MAX | |
| 55 #include <cmath> // for ceil, floor, M_PI | |
| 56 #include <cstdint> // for INT16_MAX, uint32_t, int32_t, int16_t | |
| 57 #include <memory> | |
| 58 | |
| 59 namespace tesseract { | |
| 60 | |
| 61 #define MAX_NEAREST_DIST 600 // for block skew stats | |
| 62 | |
| 63 /********************************************************************** | |
| 64 * SetBlobStrokeWidth | |
| 65 * | |
| 66 * Set the horizontal and vertical stroke widths in the blob. | |
| 67 **********************************************************************/ | |
| 68 void SetBlobStrokeWidth(Image pix, BLOBNBOX *blob) { | |
| 69 // Cut the blob rectangle into a Pix. | |
| 70 int pix_height = pixGetHeight(pix); | |
| 71 const TBOX &box = blob->bounding_box(); | |
| 72 int width = box.width(); | |
| 73 int height = box.height(); | |
| 74 Box *blob_pix_box = boxCreate(box.left(), pix_height - box.top(), width, height); | |
| 75 Image pix_blob = pixClipRectangle(pix, blob_pix_box, nullptr); | |
| 76 boxDestroy(&blob_pix_box); | |
| 77 Image dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG); | |
| 78 pix_blob.destroy(); | |
| 79 // Compute the stroke widths. | |
| 80 uint32_t *data = pixGetData(dist_pix); | |
| 81 int wpl = pixGetWpl(dist_pix); | |
| 82 // Horizontal width of stroke. | |
| 83 STATS h_stats(0, width); | |
| 84 for (int y = 0; y < height; ++y) { | |
| 85 uint32_t *pixels = data + y * wpl; | |
| 86 int prev_pixel = 0; | |
| 87 int pixel = GET_DATA_BYTE(pixels, 0); | |
| 88 for (int x = 1; x < width; ++x) { | |
| 89 int next_pixel = GET_DATA_BYTE(pixels, x); | |
| 90 // We are looking for a pixel that is equal to its vertical neighbours, | |
| 91 // yet greater than its left neighbour. | |
| 92 if (prev_pixel < pixel && (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && | |
| 93 (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) { | |
| 94 if (pixel > next_pixel) { | |
| 95 // Single local max, so an odd width. | |
| 96 h_stats.add(pixel * 2 - 1, 1); | |
| 97 } else if (pixel == next_pixel && x + 1 < width && pixel > GET_DATA_BYTE(pixels, x + 1)) { | |
| 98 // Double local max, so an even width. | |
| 99 h_stats.add(pixel * 2, 1); | |
| 100 } | |
| 101 } | |
| 102 prev_pixel = pixel; | |
| 103 pixel = next_pixel; | |
| 104 } | |
| 105 } | |
| 106 // Vertical width of stroke. | |
| 107 STATS v_stats(0, height); | |
| 108 for (int x = 0; x < width; ++x) { | |
| 109 int prev_pixel = 0; | |
| 110 int pixel = GET_DATA_BYTE(data, x); | |
| 111 for (int y = 1; y < height; ++y) { | |
| 112 uint32_t *pixels = data + y * wpl; | |
| 113 int next_pixel = GET_DATA_BYTE(pixels, x); | |
| 114 // We are looking for a pixel that is equal to its horizontal neighbours, | |
| 115 // yet greater than its upper neighbour. | |
| 116 if (prev_pixel < pixel && (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && | |
| 117 (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) { | |
| 118 if (pixel > next_pixel) { | |
| 119 // Single local max, so an odd width. | |
| 120 v_stats.add(pixel * 2 - 1, 1); | |
| 121 } else if (pixel == next_pixel && y + 1 < height && | |
| 122 pixel > GET_DATA_BYTE(pixels + wpl, x)) { | |
| 123 // Double local max, so an even width. | |
| 124 v_stats.add(pixel * 2, 1); | |
| 125 } | |
| 126 } | |
| 127 prev_pixel = pixel; | |
| 128 pixel = next_pixel; | |
| 129 } | |
| 130 } | |
| 131 dist_pix.destroy(); | |
| 132 // Store the horizontal and vertical width in the blob, keeping both | |
| 133 // widths if there is enough information, otherwise only the one with | |
| 134 // the most samples. | |
| 135 // If there are insufficient samples, store zero, rather than using | |
| 136 // 2*area/perimeter, as the numbers that gives do not match the numbers | |
| 137 // from the distance method. | |
| 138 if (h_stats.get_total() >= (width + height) / 4) { | |
| 139 blob->set_horz_stroke_width(h_stats.ile(0.5f)); | |
| 140 if (v_stats.get_total() >= (width + height) / 4) { | |
| 141 blob->set_vert_stroke_width(v_stats.ile(0.5f)); | |
| 142 } else { | |
| 143 blob->set_vert_stroke_width(0.0f); | |
| 144 } | |
| 145 } else { | |
| 146 if (v_stats.get_total() >= (width + height) / 4 || v_stats.get_total() > h_stats.get_total()) { | |
| 147 blob->set_horz_stroke_width(0.0f); | |
| 148 blob->set_vert_stroke_width(v_stats.ile(0.5f)); | |
| 149 } else { | |
| 150 blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f) : 0.0f); | |
| 151 blob->set_vert_stroke_width(0.0f); | |
| 152 } | |
| 153 } | |
| 154 } | |
| 155 | |
| 156 /********************************************************************** | |
| 157 * assign_blobs_to_blocks2 | |
| 158 * | |
| 159 * Make a list of TO_BLOCKs for portrait and landscape orientation. | |
| 160 **********************************************************************/ | |
| 161 | |
| 162 void assign_blobs_to_blocks2(Image pix, | |
| 163 BLOCK_LIST *blocks, // blocks to process | |
| 164 TO_BLOCK_LIST *port_blocks) { // output list | |
| 165 BLOCK_IT block_it = blocks; | |
| 166 C_BLOB_IT blob_it; // iterator | |
| 167 BLOBNBOX_IT port_box_it; // iterator | |
| 168 // destination iterator | |
| 169 TO_BLOCK_IT port_block_it = port_blocks; | |
| 170 | |
| 171 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 172 auto block = block_it.data(); | |
| 173 auto port_block = new TO_BLOCK(block); | |
| 174 | |
| 175 // Convert the good outlines to block->blob_list | |
| 176 port_box_it.set_to_list(&port_block->blobs); | |
| 177 blob_it.set_to_list(block->blob_list()); | |
| 178 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 179 auto blob = blob_it.extract(); | |
| 180 auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. | |
| 181 newblob->set_owns_cblob(true); | |
| 182 SetBlobStrokeWidth(pix, newblob); | |
| 183 port_box_it.add_after_then_move(newblob); | |
| 184 } | |
| 185 | |
| 186 // Put the rejected outlines in block->noise_blobs, which allows them to | |
| 187 // be reconsidered and sorted back into rows and recover outlines mistakenly | |
| 188 // rejected. | |
| 189 port_box_it.set_to_list(&port_block->noise_blobs); | |
| 190 blob_it.set_to_list(block->reject_blobs()); | |
| 191 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 192 auto blob = blob_it.extract(); | |
| 193 auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. | |
| 194 newblob->set_owns_cblob(true); | |
| 195 SetBlobStrokeWidth(pix, newblob); | |
| 196 port_box_it.add_after_then_move(newblob); | |
| 197 } | |
| 198 | |
| 199 port_block_it.add_after_then_move(port_block); | |
| 200 } | |
| 201 } | |
| 202 | |
| 203 /********************************************************************** | |
| 204 * find_components | |
| 205 * | |
| 206 * Find the C_OUTLINEs of the connected components in each block, put them | |
| 207 * in C_BLOBs, and filter them by size, putting the different size | |
| 208 * grades on different lists in the matching TO_BLOCK in to_blocks. | |
| 209 **********************************************************************/ | |
| 210 | |
| 211 void Textord::find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) { | |
| 212 int width = pixGetWidth(pix); | |
| 213 int height = pixGetHeight(pix); | |
| 214 if (width > INT16_MAX || height > INT16_MAX) { | |
| 215 tprintf("Input image too large! (%d, %d)\n", width, height); | |
| 216 return; // Can't handle it. | |
| 217 } | |
| 218 | |
| 219 BLOCK_IT block_it(blocks); // iterator | |
| 220 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 221 BLOCK *block = block_it.data(); | |
| 222 if (block->pdblk.poly_block() == nullptr || block->pdblk.poly_block()->IsText()) { | |
| 223 extract_edges(pix, block); | |
| 224 } | |
| 225 } | |
| 226 | |
| 227 assign_blobs_to_blocks2(pix, blocks, to_blocks); | |
| 228 ICOORD page_tr(width, height); | |
| 229 filter_blobs(page_tr, to_blocks, !textord_test_landscape); | |
| 230 } | |
| 231 | |
| 232 /********************************************************************** | |
| 233 * filter_blobs | |
| 234 * | |
| 235 * Sort the blobs into sizes in all the blocks for later work. | |
| 236 **********************************************************************/ | |
| 237 | |
| 238 void Textord::filter_blobs(ICOORD page_tr, // top right | |
| 239 TO_BLOCK_LIST *blocks, // output list | |
| 240 bool testing_on) { // for plotting | |
| 241 TO_BLOCK_IT block_it = blocks; // destination iterator | |
| 242 TO_BLOCK *block; // created block | |
| 243 | |
| 244 #ifndef GRAPHICS_DISABLED | |
| 245 if (to_win != nullptr) { | |
| 246 to_win->Clear(); | |
| 247 } | |
| 248 #endif // !GRAPHICS_DISABLED | |
| 249 | |
| 250 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 251 block = block_it.data(); | |
| 252 block->line_size = filter_noise_blobs(&block->blobs, &block->noise_blobs, &block->small_blobs, | |
| 253 &block->large_blobs); | |
| 254 if (block->line_size == 0) { | |
| 255 block->line_size = 1; | |
| 256 } | |
| 257 block->line_spacing = | |
| 258 block->line_size * | |
| 259 (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction + | |
| 260 2 * tesseract::CCStruct::kAscenderFraction) / | |
| 261 tesseract::CCStruct::kXHeightFraction; | |
| 262 block->line_size *= textord_min_linesize; | |
| 263 block->max_blob_size = block->line_size * textord_excess_blobsize; | |
| 264 | |
| 265 #ifndef GRAPHICS_DISABLED | |
| 266 if (textord_show_blobs && testing_on) { | |
| 267 if (to_win == nullptr) { | |
| 268 create_to_win(page_tr); | |
| 269 } | |
| 270 block->plot_graded_blobs(to_win); | |
| 271 } | |
| 272 if (textord_show_boxes && testing_on) { | |
| 273 if (to_win == nullptr) { | |
| 274 create_to_win(page_tr); | |
| 275 } | |
| 276 plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE); | |
| 277 plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE); | |
| 278 plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE); | |
| 279 plot_box_list(to_win, &block->blobs, ScrollView::WHITE); | |
| 280 } | |
| 281 #endif // !GRAPHICS_DISABLED | |
| 282 } | |
| 283 } | |
| 284 | |
| 285 /********************************************************************** | |
| 286 * filter_noise_blobs | |
| 287 * | |
| 288 * Move small blobs to a separate list. | |
| 289 **********************************************************************/ | |
| 290 | |
| 291 float Textord::filter_noise_blobs(BLOBNBOX_LIST *src_list, // original list | |
| 292 BLOBNBOX_LIST *noise_list, // noise list | |
| 293 BLOBNBOX_LIST *small_list, // small blobs | |
| 294 BLOBNBOX_LIST *large_list) { // large blobs | |
| 295 int16_t height; // height of blob | |
| 296 int16_t width; // of blob | |
| 297 BLOBNBOX *blob; // current blob | |
| 298 float initial_x; // first guess | |
| 299 BLOBNBOX_IT src_it = src_list; // iterators | |
| 300 BLOBNBOX_IT noise_it = noise_list; | |
| 301 BLOBNBOX_IT small_it = small_list; | |
| 302 BLOBNBOX_IT large_it = large_list; | |
| 303 STATS size_stats(0, MAX_NEAREST_DIST - 1); | |
| 304 // blob heights | |
| 305 float min_y; // size limits | |
| 306 float max_y; | |
| 307 float max_x; | |
| 308 float max_height; // of good blobs | |
| 309 | |
| 310 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { | |
| 311 blob = src_it.data(); | |
| 312 if (blob->bounding_box().height() < textord_max_noise_size) { | |
| 313 noise_it.add_after_then_move(src_it.extract()); | |
| 314 } else if (blob->enclosed_area() >= blob->bounding_box().height() * | |
| 315 blob->bounding_box().width() * | |
| 316 textord_noise_area_ratio) { | |
| 317 small_it.add_after_then_move(src_it.extract()); | |
| 318 } | |
| 319 } | |
| 320 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { | |
| 321 size_stats.add(src_it.data()->bounding_box().height(), 1); | |
| 322 } | |
| 323 initial_x = size_stats.ile(textord_initialx_ile); | |
| 324 max_y = ceil(initial_x * | |
| 325 (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction + | |
| 326 2 * tesseract::CCStruct::kAscenderFraction) / | |
| 327 tesseract::CCStruct::kXHeightFraction); | |
| 328 min_y = std::floor(initial_x / 2); | |
| 329 max_x = ceil(initial_x * textord_width_limit); | |
| 330 small_it.move_to_first(); | |
| 331 for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) { | |
| 332 height = small_it.data()->bounding_box().height(); | |
| 333 if (height > max_y) { | |
| 334 large_it.add_after_then_move(small_it.extract()); | |
| 335 } else if (height >= min_y) { | |
| 336 src_it.add_after_then_move(small_it.extract()); | |
| 337 } | |
| 338 } | |
| 339 size_stats.clear(); | |
| 340 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { | |
| 341 height = src_it.data()->bounding_box().height(); | |
| 342 width = src_it.data()->bounding_box().width(); | |
| 343 if (height < min_y) { | |
| 344 small_it.add_after_then_move(src_it.extract()); | |
| 345 } else if (height > max_y || width > max_x) { | |
| 346 large_it.add_after_then_move(src_it.extract()); | |
| 347 } else { | |
| 348 size_stats.add(height, 1); | |
| 349 } | |
| 350 } | |
| 351 max_height = size_stats.ile(textord_initialasc_ile); | |
| 352 // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,", | |
| 353 // max_y,min_y,initial_x,max_height); | |
| 354 max_height *= tesseract::CCStruct::kXHeightCapRatio; | |
| 355 if (max_height > initial_x) { | |
| 356 initial_x = max_height; | |
| 357 } | |
| 358 // tprintf(" ret=%g\n",initial_x); | |
| 359 return initial_x; | |
| 360 } | |
| 361 | |
| 362 // Fixes the block so it obeys all the rules: | |
| 363 // Must have at least one ROW. | |
| 364 // Must have at least one WERD. | |
| 365 // WERDs contain a fake blob. | |
| 366 void Textord::cleanup_nontext_block(BLOCK *block) { | |
| 367 // Non-text blocks must contain at least one row. | |
| 368 ROW_IT row_it(block->row_list()); | |
| 369 if (row_it.empty()) { | |
| 370 const TBOX &box = block->pdblk.bounding_box(); | |
| 371 float height = box.height(); | |
| 372 int32_t xstarts[2] = {box.left(), box.right()}; | |
| 373 double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())}; | |
| 374 ROW *row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1); | |
| 375 row_it.add_after_then_move(row); | |
| 376 } | |
| 377 // Each row must contain at least one word. | |
| 378 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 379 ROW *row = row_it.data(); | |
| 380 WERD_IT w_it(row->word_list()); | |
| 381 if (w_it.empty()) { | |
| 382 // Make a fake blob to put in the word. | |
| 383 TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box(); | |
| 384 C_BLOB *blob = C_BLOB::FakeBlob(box); | |
| 385 C_BLOB_LIST blobs; | |
| 386 C_BLOB_IT blob_it(&blobs); | |
| 387 blob_it.add_after_then_move(blob); | |
| 388 WERD *word = new WERD(&blobs, 0, nullptr); | |
| 389 w_it.add_after_then_move(word); | |
| 390 } | |
| 391 // Each word must contain a fake blob. | |
| 392 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { | |
| 393 WERD *word = w_it.data(); | |
| 394 // Just assert that this is true, as it would be useful to find | |
| 395 // out why it isn't. | |
| 396 ASSERT_HOST(!word->cblob_list()->empty()); | |
| 397 } | |
| 398 row->recalc_bounding_box(); | |
| 399 } | |
| 400 } | |
| 401 | |
| 402 /********************************************************************** | |
| 403 * cleanup_blocks | |
| 404 * | |
| 405 * Delete empty blocks, rows from the page. | |
| 406 **********************************************************************/ | |
| 407 | |
| 408 void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) { | |
| 409 BLOCK_IT block_it = blocks; // iterator | |
| 410 ROW_IT row_it; // row iterator | |
| 411 | |
| 412 int num_rows = 0; | |
| 413 int num_rows_all = 0; | |
| 414 int num_blocks = 0; | |
| 415 int num_blocks_all = 0; | |
| 416 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 417 BLOCK *block = block_it.data(); | |
| 418 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { | |
| 419 cleanup_nontext_block(block); | |
| 420 continue; | |
| 421 } | |
| 422 num_rows = 0; | |
| 423 num_rows_all = 0; | |
| 424 if (clean_noise) { | |
| 425 row_it.set_to_list(block->row_list()); | |
| 426 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 427 ROW *row = row_it.data(); | |
| 428 ++num_rows_all; | |
| 429 clean_small_noise_from_words(row); | |
| 430 if ((textord_noise_rejrows && !row->word_list()->empty() && clean_noise_from_row(row)) || | |
| 431 row->word_list()->empty()) { | |
| 432 delete row_it.extract(); // lose empty row. | |
| 433 } else { | |
| 434 if (textord_noise_rejwords) { | |
| 435 clean_noise_from_words(row_it.data()); | |
| 436 } | |
| 437 if (textord_blshift_maxshift >= 0) { | |
| 438 tweak_row_baseline(row, textord_blshift_maxshift, textord_blshift_xfraction); | |
| 439 } | |
| 440 ++num_rows; | |
| 441 } | |
| 442 } | |
| 443 } | |
| 444 if (block->row_list()->empty()) { | |
| 445 delete block_it.extract(); // Lose empty text blocks. | |
| 446 } else { | |
| 447 ++num_blocks; | |
| 448 } | |
| 449 ++num_blocks_all; | |
| 450 if (textord_noise_debug) { | |
| 451 tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all); | |
| 452 } | |
| 453 } | |
| 454 if (textord_noise_debug) { | |
| 455 tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all); | |
| 456 } | |
| 457 } | |
| 458 | |
| 459 /********************************************************************** | |
| 460 * clean_noise_from_row | |
| 461 * | |
| 462 * Move blobs of words from rows of garbage into the reject blobs list. | |
| 463 **********************************************************************/ | |
| 464 | |
| 465 bool Textord::clean_noise_from_row( // remove empties | |
| 466 ROW *row // row to clean | |
| 467 ) { | |
| 468 bool testing_on; | |
| 469 TBOX blob_box; // bounding box | |
| 470 C_BLOB *blob; // current blob | |
| 471 C_OUTLINE *outline; // current outline | |
| 472 WERD *word; // current word | |
| 473 int32_t blob_size; // biggest size | |
| 474 int32_t trans_count = 0; // no of transitions | |
| 475 int32_t trans_threshold; // noise tolerance | |
| 476 int32_t dot_count; // small objects | |
| 477 int32_t norm_count; // normal objects | |
| 478 int32_t super_norm_count; // real char-like | |
| 479 // words of row | |
| 480 WERD_IT word_it = row->word_list(); | |
| 481 C_BLOB_IT blob_it; // blob iterator | |
| 482 C_OUTLINE_IT out_it; // outline iterator | |
| 483 | |
| 484 testing_on = textord_test_y > row->base_line(textord_test_x) && textord_show_blobs && | |
| 485 textord_test_y < row->base_line(textord_test_x) + row->x_height(); | |
| 486 dot_count = 0; | |
| 487 norm_count = 0; | |
| 488 super_norm_count = 0; | |
| 489 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 490 word = word_it.data(); // current word | |
| 491 // blobs in word | |
| 492 blob_it.set_to_list(word->cblob_list()); | |
| 493 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 494 blob = blob_it.data(); | |
| 495 if (!word->flag(W_DONT_CHOP)) { | |
| 496 // get outlines | |
| 497 out_it.set_to_list(blob->out_list()); | |
| 498 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { | |
| 499 outline = out_it.data(); | |
| 500 blob_box = outline->bounding_box(); | |
| 501 blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height(); | |
| 502 if (blob_size < textord_noise_sizelimit * row->x_height()) { | |
| 503 dot_count++; // count small outlines | |
| 504 } | |
| 505 if (!outline->child()->empty() && | |
| 506 blob_box.height() < (1 + textord_noise_syfract) * row->x_height() && | |
| 507 blob_box.height() > (1 - textord_noise_syfract) * row->x_height() && | |
| 508 blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() && | |
| 509 blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) { | |
| 510 super_norm_count++; // count small outlines | |
| 511 } | |
| 512 } | |
| 513 } else { | |
| 514 super_norm_count++; | |
| 515 } | |
| 516 blob_box = blob->bounding_box(); | |
| 517 blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height(); | |
| 518 if (blob_size >= textord_noise_sizelimit * row->x_height() && | |
| 519 blob_size < row->x_height() * 2) { | |
| 520 trans_threshold = blob_size / textord_noise_sizefraction; | |
| 521 trans_count = blob->count_transitions(trans_threshold); | |
| 522 if (trans_count < textord_noise_translimit) { | |
| 523 norm_count++; | |
| 524 } | |
| 525 } else if (blob_box.height() > row->x_height() * 2 && | |
| 526 (!word_it.at_first() || !blob_it.at_first())) { | |
| 527 dot_count += 2; | |
| 528 } | |
| 529 if (testing_on) { | |
| 530 tprintf("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", blob_box.left(), | |
| 531 blob_box.bottom(), blob_box.right(), blob_box.top(), blob->out_list()->length(), | |
| 532 trans_count, blob_box.bottom() - row->base_line(blob_box.left())); | |
| 533 } | |
| 534 } | |
| 535 } | |
| 536 // TODO: check whether `&& super_norm_count < textord_noise_sncount`should always be added here. | |
| 537 bool rejected = dot_count > norm_count * textord_noise_normratio && | |
| 538 dot_count > 2; | |
| 539 if (textord_noise_debug) { | |
| 540 tprintf("Row ending at (%d,%g):", blob_box.right(), row->base_line(blob_box.right())); | |
| 541 tprintf(" R=%g, dc=%d, nc=%d, %s\n", | |
| 542 norm_count > 0 ? static_cast<float>(dot_count) / norm_count : 9999, dot_count, | |
| 543 norm_count, | |
| 544 rejected? "REJECTED": "ACCEPTED"); | |
| 545 } | |
| 546 return super_norm_count < textord_noise_sncount && rejected; | |
| 547 } | |
| 548 | |
| 549 /********************************************************************** | |
| 550 * clean_noise_from_words | |
| 551 * | |
| 552 * Move blobs of words from rows of garbage into the reject blobs list. | |
| 553 **********************************************************************/ | |
| 554 | |
| 555 void Textord::clean_noise_from_words( // remove empties | |
| 556 ROW *row // row to clean | |
| 557 ) { | |
| 558 TBOX blob_box; // bounding box | |
| 559 C_BLOB *blob; // current blob | |
| 560 C_OUTLINE *outline; // current outline | |
| 561 WERD *word; // current word | |
| 562 int32_t blob_size; // biggest size | |
| 563 int32_t trans_count; // no of transitions | |
| 564 int32_t trans_threshold; // noise tolerance | |
| 565 int32_t dot_count; // small objects | |
| 566 int32_t norm_count; // normal objects | |
| 567 int32_t dud_words; // number discarded | |
| 568 int32_t ok_words; // number remaining | |
| 569 int32_t word_index; // current word | |
| 570 // words of row | |
| 571 WERD_IT word_it = row->word_list(); | |
| 572 C_BLOB_IT blob_it; // blob iterator | |
| 573 C_OUTLINE_IT out_it; // outline iterator | |
| 574 | |
| 575 ok_words = word_it.length(); | |
| 576 if (ok_words == 0 || textord_no_rejects) { | |
| 577 return; | |
| 578 } | |
| 579 // was it chucked | |
| 580 std::vector<int8_t> word_dud(ok_words); | |
| 581 dud_words = 0; | |
| 582 ok_words = 0; | |
| 583 word_index = 0; | |
| 584 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 585 word = word_it.data(); // current word | |
| 586 dot_count = 0; | |
| 587 norm_count = 0; | |
| 588 // blobs in word | |
| 589 blob_it.set_to_list(word->cblob_list()); | |
| 590 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 591 blob = blob_it.data(); | |
| 592 if (!word->flag(W_DONT_CHOP)) { | |
| 593 // get outlines | |
| 594 out_it.set_to_list(blob->out_list()); | |
| 595 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { | |
| 596 outline = out_it.data(); | |
| 597 blob_box = outline->bounding_box(); | |
| 598 blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height(); | |
| 599 if (blob_size < textord_noise_sizelimit * row->x_height()) { | |
| 600 dot_count++; // count small outlines | |
| 601 } | |
| 602 if (!outline->child()->empty() && | |
| 603 blob_box.height() < (1 + textord_noise_syfract) * row->x_height() && | |
| 604 blob_box.height() > (1 - textord_noise_syfract) * row->x_height() && | |
| 605 blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() && | |
| 606 blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) { | |
| 607 norm_count++; // count small outlines | |
| 608 } | |
| 609 } | |
| 610 } else { | |
| 611 norm_count++; | |
| 612 } | |
| 613 blob_box = blob->bounding_box(); | |
| 614 blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height(); | |
| 615 if (blob_size >= textord_noise_sizelimit * row->x_height() && | |
| 616 blob_size < row->x_height() * 2) { | |
| 617 trans_threshold = blob_size / textord_noise_sizefraction; | |
| 618 trans_count = blob->count_transitions(trans_threshold); | |
| 619 if (trans_count < textord_noise_translimit) { | |
| 620 norm_count++; | |
| 621 } | |
| 622 } else if (blob_box.height() > row->x_height() * 2 && | |
| 623 (!word_it.at_first() || !blob_it.at_first())) { | |
| 624 dot_count += 2; | |
| 625 } | |
| 626 } | |
| 627 if (dot_count > 2 && !word->flag(W_REP_CHAR)) { | |
| 628 if (dot_count > norm_count * textord_noise_normratio * 2) { | |
| 629 word_dud[word_index] = 2; | |
| 630 } else if (dot_count > norm_count * textord_noise_normratio) { | |
| 631 word_dud[word_index] = 1; | |
| 632 } else { | |
| 633 word_dud[word_index] = 0; | |
| 634 } | |
| 635 } else { | |
| 636 word_dud[word_index] = 0; | |
| 637 } | |
| 638 if (word_dud[word_index] == 2) { | |
| 639 dud_words++; | |
| 640 } else { | |
| 641 ok_words++; | |
| 642 } | |
| 643 word_index++; | |
| 644 } | |
| 645 | |
| 646 word_index = 0; | |
| 647 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 648 if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { | |
| 649 word = word_it.data(); // Current word. | |
| 650 // Previously we threw away the entire word. | |
| 651 // Now just aggressively throw all small blobs into the reject list, where | |
| 652 // the classifier can decide whether they are actually needed. | |
| 653 word->CleanNoise(textord_noise_sizelimit * row->x_height()); | |
| 654 } | |
| 655 word_index++; | |
| 656 } | |
| 657 } | |
| 658 | |
| 659 // Remove outlines that are a tiny fraction in either width or height | |
| 660 // of the word height. | |
| 661 void Textord::clean_small_noise_from_words(ROW *row) { | |
| 662 WERD_IT word_it(row->word_list()); | |
| 663 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 664 WERD *word = word_it.data(); | |
| 665 int min_size = static_cast<int>(textord_noise_hfract * word->bounding_box().height() + 0.5); | |
| 666 C_BLOB_IT blob_it(word->cblob_list()); | |
| 667 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 668 C_BLOB *blob = blob_it.data(); | |
| 669 C_OUTLINE_IT out_it(blob->out_list()); | |
| 670 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { | |
| 671 C_OUTLINE *outline = out_it.data(); | |
| 672 outline->RemoveSmallRecursive(min_size, &out_it); | |
| 673 } | |
| 674 if (blob->out_list()->empty()) { | |
| 675 delete blob_it.extract(); | |
| 676 } | |
| 677 } | |
| 678 if (word->cblob_list()->empty()) { | |
| 679 if (!word_it.at_last()) { | |
| 680 // The next word is no longer a fuzzy non space if it was before, | |
| 681 // since the word before is about to be deleted. | |
| 682 WERD *next_word = word_it.data_relative(1); | |
| 683 if (next_word->flag(W_FUZZY_NON)) { | |
| 684 next_word->set_flag(W_FUZZY_NON, false); | |
| 685 } | |
| 686 } | |
| 687 delete word_it.extract(); | |
| 688 } | |
| 689 } | |
| 690 } | |
| 691 | |
| 692 // Local struct to hold a group of blocks. | |
| 693 struct BlockGroup { | |
| 694 BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {} | |
| 695 explicit BlockGroup(BLOCK *block) | |
| 696 : bounding_box(block->pdblk.bounding_box()) | |
| 697 , rotation(block->re_rotation()) | |
| 698 , angle(block->re_rotation().angle()) | |
| 699 , min_xheight(block->x_height()) { | |
| 700 blocks.push_back(block); | |
| 701 } | |
| 702 // Union of block bounding boxes. | |
| 703 TBOX bounding_box; | |
| 704 // Common rotation of the blocks. | |
| 705 FCOORD rotation; | |
| 706 // Angle of rotation. | |
| 707 float angle; | |
| 708 // Min xheight of the blocks. | |
| 709 float min_xheight; | |
| 710 // Collection of borrowed pointers to the blocks in the group. | |
| 711 std::vector<BLOCK *> blocks; | |
| 712 }; | |
| 713 | |
| 714 // Groups blocks by rotation, then, for each group, makes a WordGrid and calls | |
| 715 // TransferDiacriticsToWords to copy the diacritic blobs to the most | |
| 716 // appropriate words in the group of blocks. Source blobs are not touched. | |
| 717 void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks) { | |
| 718 // Angle difference larger than this is too much to consider equal. | |
| 719 // They should only be in multiples of M_PI/2 anyway. | |
| 720 const double kMaxAngleDiff = 0.01; // About 0.6 degrees. | |
| 721 std::vector<std::unique_ptr<BlockGroup>> groups; | |
| 722 BLOCK_IT bk_it(blocks); | |
| 723 for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) { | |
| 724 BLOCK *block = bk_it.data(); | |
| 725 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { | |
| 726 continue; | |
| 727 } | |
| 728 // Linear search of the groups to find a matching rotation. | |
| 729 float block_angle = block->re_rotation().angle(); | |
| 730 int best_g = 0; | |
| 731 float best_angle_diff = FLT_MAX; | |
| 732 for (const auto &group : groups) { | |
| 733 double angle_diff = std::fabs(block_angle - group->angle); | |
| 734 if (angle_diff > M_PI) { | |
| 735 angle_diff = fabs(angle_diff - 2.0 * M_PI); | |
| 736 } | |
| 737 if (angle_diff < best_angle_diff) { | |
| 738 best_angle_diff = angle_diff; | |
| 739 best_g = &group - &groups[0]; | |
| 740 } | |
| 741 } | |
| 742 if (best_angle_diff > kMaxAngleDiff) { | |
| 743 groups.push_back(std::make_unique<BlockGroup>(block)); | |
| 744 } else { | |
| 745 groups[best_g]->blocks.push_back(block); | |
| 746 groups[best_g]->bounding_box += block->pdblk.bounding_box(); | |
| 747 float x_height = block->x_height(); | |
| 748 if (x_height < groups[best_g]->min_xheight) { | |
| 749 groups[best_g]->min_xheight = x_height; | |
| 750 } | |
| 751 } | |
| 752 } | |
| 753 // Now process each group of blocks. | |
| 754 std::vector<std::unique_ptr<WordWithBox>> word_ptrs; | |
| 755 for (const auto &group : groups) { | |
| 756 if (group->bounding_box.null_box()) { | |
| 757 continue; | |
| 758 } | |
| 759 WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(), | |
| 760 group->bounding_box.topright()); | |
| 761 for (auto b : group->blocks) { | |
| 762 ROW_IT row_it(b->row_list()); | |
| 763 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 764 ROW *row = row_it.data(); | |
| 765 // Put the words of the row into the grid. | |
| 766 WERD_IT w_it(row->word_list()); | |
| 767 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { | |
| 768 WERD *word = w_it.data(); | |
| 769 auto box_word = std::make_unique<WordWithBox>(word); | |
| 770 word_grid.InsertBBox(true, true, box_word.get()); | |
| 771 // Save the pointer where it will be auto-deleted. | |
| 772 word_ptrs.emplace_back(std::move(box_word)); | |
| 773 } | |
| 774 } | |
| 775 } | |
| 776 FCOORD rotation = group->rotation; | |
| 777 // Make it a forward rotation that will transform blob coords to block. | |
| 778 rotation.set_y(-rotation.y()); | |
| 779 TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid); | |
| 780 } | |
| 781 } | |
| 782 | |
| 783 // Places a copy of blobs that are near a word (after applying rotation to the | |
| 784 // blob) in the most appropriate word, unless there is doubt, in which case a | |
| 785 // blob can end up in two words. Source blobs are not touched. | |
| 786 void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, const FCOORD &rotation, | |
| 787 WordGrid *word_grid) { | |
| 788 WordSearch ws(word_grid); | |
| 789 BLOBNBOX_IT b_it(diacritic_blobs); | |
| 790 // Apply rotation to each blob before finding the nearest words. The rotation | |
| 791 // allows us to only consider above/below placement and not left/right on | |
| 792 // vertical text, because all text is horizontal here. | |
| 793 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { | |
| 794 BLOBNBOX *blobnbox = b_it.data(); | |
| 795 TBOX blob_box = blobnbox->bounding_box(); | |
| 796 blob_box.rotate(rotation); | |
| 797 ws.StartRectSearch(blob_box); | |
| 798 // Above/below refer to word position relative to diacritic. Since some | |
| 799 // scripts eg Kannada/Telugu habitually put diacritics below words, and | |
| 800 // others eg Thai/Vietnamese/Latin put most diacritics above words, try | |
| 801 // for both if there isn't much in it. | |
| 802 WordWithBox *best_above_word = nullptr; | |
| 803 WordWithBox *best_below_word = nullptr; | |
| 804 int best_above_distance = 0; | |
| 805 int best_below_distance = 0; | |
| 806 for (WordWithBox *word = ws.NextRectSearch(); word != nullptr; word = ws.NextRectSearch()) { | |
| 807 if (word->word()->flag(W_REP_CHAR)) { | |
| 808 continue; | |
| 809 } | |
| 810 TBOX word_box = word->true_bounding_box(); | |
| 811 int x_distance = blob_box.x_gap(word_box); | |
| 812 int y_distance = blob_box.y_gap(word_box); | |
| 813 if (x_distance > 0) { | |
| 814 // Arbitrarily divide x-distance by 2 if there is a major y overlap, | |
| 815 // and the word is to the left of the diacritic. If the | |
| 816 // diacritic is a dropped broken character between two words, this will | |
| 817 // help send all the pieces to a single word, instead of splitting them | |
| 818 // over the 2 words. | |
| 819 if (word_box.major_y_overlap(blob_box) && blob_box.left() > word_box.right()) { | |
| 820 x_distance /= 2; | |
| 821 } | |
| 822 y_distance += x_distance; | |
| 823 } | |
| 824 if (word_box.y_middle() > blob_box.y_middle() && | |
| 825 (best_above_word == nullptr || y_distance < best_above_distance)) { | |
| 826 best_above_word = word; | |
| 827 best_above_distance = y_distance; | |
| 828 } | |
| 829 if (word_box.y_middle() <= blob_box.y_middle() && | |
| 830 (best_below_word == nullptr || y_distance < best_below_distance)) { | |
| 831 best_below_word = word; | |
| 832 best_below_distance = y_distance; | |
| 833 } | |
| 834 } | |
| 835 bool above_good = best_above_word != nullptr && | |
| 836 (best_below_word == nullptr || | |
| 837 best_above_distance < best_below_distance + blob_box.height()); | |
| 838 bool below_good = best_below_word != nullptr && best_below_word != best_above_word && | |
| 839 (best_above_word == nullptr || | |
| 840 best_below_distance < best_above_distance + blob_box.height()); | |
| 841 if (below_good) { | |
| 842 C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); | |
| 843 copied_blob->rotate(rotation); | |
| 844 // Put the blob into the word's reject blobs list. | |
| 845 C_BLOB_IT blob_it(best_below_word->RejBlobs()); | |
| 846 blob_it.add_to_end(copied_blob); | |
| 847 } | |
| 848 if (above_good) { | |
| 849 C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); | |
| 850 copied_blob->rotate(rotation); | |
| 851 // Put the blob into the word's reject blobs list. | |
| 852 C_BLOB_IT blob_it(best_above_word->RejBlobs()); | |
| 853 blob_it.add_to_end(copied_blob); | |
| 854 } | |
| 855 } | |
| 856 } | |
| 857 | |
| 858 /********************************************************************** | |
| 859 * tweak_row_baseline | |
| 860 * | |
| 861 * Shift baseline to fit the blobs more accurately where they are | |
| 862 * close enough. | |
| 863 **********************************************************************/ | |
| 864 | |
| 865 void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction) { | |
| 866 TBOX blob_box; // bounding box | |
| 867 C_BLOB *blob; // current blob | |
| 868 WERD *word; // current word | |
| 869 int32_t blob_count; // no of blobs | |
| 870 int32_t src_index; // source segment | |
| 871 int32_t dest_index; // destination segment | |
| 872 float ydiff; // baseline error | |
| 873 float x_centre; // centre of blob | |
| 874 // words of row | |
| 875 WERD_IT word_it = row->word_list(); | |
| 876 C_BLOB_IT blob_it; // blob iterator | |
| 877 | |
| 878 blob_count = 0; | |
| 879 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 880 word = word_it.data(); // current word | |
| 881 // get total blobs | |
| 882 blob_count += word->cblob_list()->length(); | |
| 883 } | |
| 884 if (blob_count == 0) { | |
| 885 return; | |
| 886 } | |
| 887 // spline segments | |
| 888 std::vector<int32_t> xstarts(blob_count + row->baseline.segments + 1); | |
| 889 // spline coeffs | |
| 890 std::vector<double> coeffs((blob_count + row->baseline.segments) * 3); | |
| 891 | |
| 892 src_index = 0; | |
| 893 dest_index = 0; | |
| 894 xstarts[0] = row->baseline.xcoords[0]; | |
| 895 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 896 word = word_it.data(); // current word | |
| 897 // blobs in word | |
| 898 blob_it.set_to_list(word->cblob_list()); | |
| 899 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 900 blob = blob_it.data(); | |
| 901 blob_box = blob->bounding_box(); | |
| 902 x_centre = (blob_box.left() + blob_box.right()) / 2.0; | |
| 903 ydiff = blob_box.bottom() - row->base_line(x_centre); | |
| 904 if (ydiff < 0) { | |
| 905 ydiff = -ydiff / row->x_height(); | |
| 906 } else { | |
| 907 ydiff = ydiff / row->x_height(); | |
| 908 } | |
| 909 if (ydiff < blshift_maxshift && blob_box.height() / row->x_height() > blshift_xfraction) { | |
| 910 if (xstarts[dest_index] >= x_centre) { | |
| 911 xstarts[dest_index] = blob_box.left(); | |
| 912 } | |
| 913 coeffs[dest_index * 3] = 0; | |
| 914 coeffs[dest_index * 3 + 1] = 0; | |
| 915 coeffs[dest_index * 3 + 2] = blob_box.bottom(); | |
| 916 // shift it | |
| 917 dest_index++; | |
| 918 xstarts[dest_index] = blob_box.right() + 1; | |
| 919 } else { | |
| 920 if (xstarts[dest_index] <= x_centre) { | |
| 921 while (row->baseline.xcoords[src_index + 1] <= x_centre && | |
| 922 src_index < row->baseline.segments - 1) { | |
| 923 if (row->baseline.xcoords[src_index + 1] > xstarts[dest_index]) { | |
| 924 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; | |
| 925 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; | |
| 926 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; | |
| 927 dest_index++; | |
| 928 xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; | |
| 929 } | |
| 930 src_index++; | |
| 931 } | |
| 932 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; | |
| 933 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; | |
| 934 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; | |
| 935 dest_index++; | |
| 936 xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; | |
| 937 } | |
| 938 } | |
| 939 } | |
| 940 } | |
| 941 while (src_index < row->baseline.segments && | |
| 942 row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) { | |
| 943 src_index++; | |
| 944 } | |
| 945 while (src_index < row->baseline.segments) { | |
| 946 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; | |
| 947 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; | |
| 948 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; | |
| 949 dest_index++; | |
| 950 src_index++; | |
| 951 xstarts[dest_index] = row->baseline.xcoords[src_index]; | |
| 952 } | |
| 953 // turn to spline | |
| 954 row->baseline = QSPLINE(dest_index, &xstarts[0], &coeffs[0]); | |
| 955 } | |
| 956 | |
| 957 } // namespace tesseract |
