diff mupdf-source/thirdparty/tesseract/src/textord/tordmain.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer includes a version number.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/tordmain.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,957 @@ +/********************************************************************** + * File: tordmain.cpp (Formerly textordp.c) + * Description: C++ top level textord code. + * Author: Ray Smith + * + * (C) Copyright 1992, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#define _USE_MATH_DEFINES // for M_PI + +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "tordmain.h" + +#include "arrayaccess.h" // for GET_DATA_BYTE +#include "blobbox.h" // for BLOBNBOX_IT, BLOBNBOX, TO_BLOCK, TO_B... +#include "ccstruct.h" // for CCStruct, CCStruct::kXHeightFraction +#include "clst.h" // for CLISTIZE +#include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST, C_OUTLINE +#include "drawtord.h" // for plot_box_list, to_win, create_to_win +#include "edgblob.h" // for extract_edges +#include "errcode.h" // for ASSERT_HOST, ... +#include "makerow.h" // for textord_test_x, textord_test_y, texto... +#include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only) +#include "ocrrow.h" // for ROW, ROW_IT, ROW_LIST, tweak_row_base... +#include "params.h" // for DoubleParam, BoolParam, IntParam +#include "pdblock.h" // for PDBLK +#include "points.h" // for FCOORD, ICOORD +#include "polyblk.h" // for POLY_BLOCK +#include "quadratc.h" // for QUAD_COEFFS +#include "quspline.h" // for QSPLINE, tweak_row_baseline +#include "rect.h" // for TBOX +#include "scrollview.h" // for ScrollView, ScrollView::WHITE +#include "statistc.h" // for STATS +#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST +#include "textord.h" // for Textord, WordWithBox, WordGrid, WordS... +#include "tprintf.h" // for tprintf +#include "werd.h" // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP + +#include <allheaders.h> // for pixDestroy, pixGetHeight, boxCreate + +#include <cfloat> // for FLT_MAX +#include <cmath> // for ceil, floor, M_PI +#include <cstdint> // for INT16_MAX, uint32_t, int32_t, int16_t +#include <memory> + +namespace tesseract { + +#define MAX_NEAREST_DIST 600 // for block skew stats + +/********************************************************************** + * SetBlobStrokeWidth + * + * Set the horizontal and vertical stroke widths in the blob. + **********************************************************************/ +void SetBlobStrokeWidth(Image pix, BLOBNBOX *blob) { + // Cut the blob rectangle into a Pix. + int pix_height = pixGetHeight(pix); + const TBOX &box = blob->bounding_box(); + int width = box.width(); + int height = box.height(); + Box *blob_pix_box = boxCreate(box.left(), pix_height - box.top(), width, height); + Image pix_blob = pixClipRectangle(pix, blob_pix_box, nullptr); + boxDestroy(&blob_pix_box); + Image dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG); + pix_blob.destroy(); + // Compute the stroke widths. 
+ uint32_t *data = pixGetData(dist_pix); + int wpl = pixGetWpl(dist_pix); + // Horizontal width of stroke. + STATS h_stats(0, width); + for (int y = 0; y < height; ++y) { + uint32_t *pixels = data + y * wpl; + int prev_pixel = 0; + int pixel = GET_DATA_BYTE(pixels, 0); + for (int x = 1; x < width; ++x) { + int next_pixel = GET_DATA_BYTE(pixels, x); + // We are looking for a pixel that is equal to its vertical neighbours, + // yet greater than its left neighbour. + if (prev_pixel < pixel && (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && + (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) { + if (pixel > next_pixel) { + // Single local max, so an odd width. + h_stats.add(pixel * 2 - 1, 1); + } else if (pixel == next_pixel && x + 1 < width && pixel > GET_DATA_BYTE(pixels, x + 1)) { + // Double local max, so an even width. + h_stats.add(pixel * 2, 1); + } + } + prev_pixel = pixel; + pixel = next_pixel; + } + } + // Vertical width of stroke. + STATS v_stats(0, height); + for (int x = 0; x < width; ++x) { + int prev_pixel = 0; + int pixel = GET_DATA_BYTE(data, x); + for (int y = 1; y < height; ++y) { + uint32_t *pixels = data + y * wpl; + int next_pixel = GET_DATA_BYTE(pixels, x); + // We are looking for a pixel that is equal to its horizontal neighbours, + // yet greater than its upper neighbour. + if (prev_pixel < pixel && (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && + (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) { + if (pixel > next_pixel) { + // Single local max, so an odd width. + v_stats.add(pixel * 2 - 1, 1); + } else if (pixel == next_pixel && y + 1 < height && + pixel > GET_DATA_BYTE(pixels + wpl, x)) { + // Double local max, so an even width. + v_stats.add(pixel * 2, 1); + } + } + prev_pixel = pixel; + pixel = next_pixel; + } + } + dist_pix.destroy(); + // Store the horizontal and vertical width in the blob, keeping both + // widths if there is enough information, otherwise only the one with + // the most samples. + // If there are insufficient samples, store zero, rather than using + // 2*area/perimeter, as the numbers that gives do not match the numbers + // from the distance method. + if (h_stats.get_total() >= (width + height) / 4) { + blob->set_horz_stroke_width(h_stats.ile(0.5f)); + if (v_stats.get_total() >= (width + height) / 4) { + blob->set_vert_stroke_width(v_stats.ile(0.5f)); + } else { + blob->set_vert_stroke_width(0.0f); + } + } else { + if (v_stats.get_total() >= (width + height) / 4 || v_stats.get_total() > h_stats.get_total()) { + blob->set_horz_stroke_width(0.0f); + blob->set_vert_stroke_width(v_stats.ile(0.5f)); + } else { + blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f) : 0.0f); + blob->set_vert_stroke_width(0.0f); + } + } +} + +/********************************************************************** + * assign_blobs_to_blocks2 + * + * Make a list of TO_BLOCKs for portrait and landscape orientation. 
+ **********************************************************************/ + +void assign_blobs_to_blocks2(Image pix, + BLOCK_LIST *blocks, // blocks to process + TO_BLOCK_LIST *port_blocks) { // output list + BLOCK_IT block_it = blocks; + C_BLOB_IT blob_it; // iterator + BLOBNBOX_IT port_box_it; // iterator + // destination iterator + TO_BLOCK_IT port_block_it = port_blocks; + + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + auto block = block_it.data(); + auto port_block = new TO_BLOCK(block); + + // Convert the good outlines to block->blob_list + port_box_it.set_to_list(&port_block->blobs); + blob_it.set_to_list(block->blob_list()); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + auto blob = blob_it.extract(); + auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. + newblob->set_owns_cblob(true); + SetBlobStrokeWidth(pix, newblob); + port_box_it.add_after_then_move(newblob); + } + + // Put the rejected outlines in block->noise_blobs, which allows them to + // be reconsidered and sorted back into rows and recover outlines mistakenly + // rejected. + port_box_it.set_to_list(&port_block->noise_blobs); + blob_it.set_to_list(block->reject_blobs()); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + auto blob = blob_it.extract(); + auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. + newblob->set_owns_cblob(true); + SetBlobStrokeWidth(pix, newblob); + port_box_it.add_after_then_move(newblob); + } + + port_block_it.add_after_then_move(port_block); + } +} + +/********************************************************************** + * find_components + * + * Find the C_OUTLINEs of the connected components in each block, put them + * in C_BLOBs, and filter them by size, putting the different size + * grades on different lists in the matching TO_BLOCK in to_blocks. + **********************************************************************/ + +void Textord::find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) { + int width = pixGetWidth(pix); + int height = pixGetHeight(pix); + if (width > INT16_MAX || height > INT16_MAX) { + tprintf("Input image too large! (%d, %d)\n", width, height); + return; // Can't handle it. + } + + BLOCK_IT block_it(blocks); // iterator + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + BLOCK *block = block_it.data(); + if (block->pdblk.poly_block() == nullptr || block->pdblk.poly_block()->IsText()) { + extract_edges(pix, block); + } + } + + assign_blobs_to_blocks2(pix, blocks, to_blocks); + ICOORD page_tr(width, height); + filter_blobs(page_tr, to_blocks, !textord_test_landscape); +} + +/********************************************************************** + * filter_blobs + * + * Sort the blobs into sizes in all the blocks for later work. 
+ **********************************************************************/ + +void Textord::filter_blobs(ICOORD page_tr, // top right + TO_BLOCK_LIST *blocks, // output list + bool testing_on) { // for plotting + TO_BLOCK_IT block_it = blocks; // destination iterator + TO_BLOCK *block; // created block + +#ifndef GRAPHICS_DISABLED + if (to_win != nullptr) { + to_win->Clear(); + } +#endif // !GRAPHICS_DISABLED + + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + block = block_it.data(); + block->line_size = filter_noise_blobs(&block->blobs, &block->noise_blobs, &block->small_blobs, + &block->large_blobs); + if (block->line_size == 0) { + block->line_size = 1; + } + block->line_spacing = + block->line_size * + (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction + + 2 * tesseract::CCStruct::kAscenderFraction) / + tesseract::CCStruct::kXHeightFraction; + block->line_size *= textord_min_linesize; + block->max_blob_size = block->line_size * textord_excess_blobsize; + +#ifndef GRAPHICS_DISABLED + if (textord_show_blobs && testing_on) { + if (to_win == nullptr) { + create_to_win(page_tr); + } + block->plot_graded_blobs(to_win); + } + if (textord_show_boxes && testing_on) { + if (to_win == nullptr) { + create_to_win(page_tr); + } + plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE); + plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE); + plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE); + plot_box_list(to_win, &block->blobs, ScrollView::WHITE); + } +#endif // !GRAPHICS_DISABLED + } +} + +/********************************************************************** + * filter_noise_blobs + * + * Move small blobs to a separate list. + **********************************************************************/ + +float Textord::filter_noise_blobs(BLOBNBOX_LIST *src_list, // original list + BLOBNBOX_LIST *noise_list, // noise list + BLOBNBOX_LIST *small_list, // small blobs + BLOBNBOX_LIST *large_list) { // large blobs + int16_t height; // height of blob + int16_t width; // of blob + BLOBNBOX *blob; // current blob + float initial_x; // first guess + BLOBNBOX_IT src_it = src_list; // iterators + BLOBNBOX_IT noise_it = noise_list; + BLOBNBOX_IT small_it = small_list; + BLOBNBOX_IT large_it = large_list; + STATS size_stats(0, MAX_NEAREST_DIST - 1); + // blob heights + float min_y; // size limits + float max_y; + float max_x; + float max_height; // of good blobs + + for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { + blob = src_it.data(); + if (blob->bounding_box().height() < textord_max_noise_size) { + noise_it.add_after_then_move(src_it.extract()); + } else if (blob->enclosed_area() >= blob->bounding_box().height() * + blob->bounding_box().width() * + textord_noise_area_ratio) { + small_it.add_after_then_move(src_it.extract()); + } + } + for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { + size_stats.add(src_it.data()->bounding_box().height(), 1); + } + initial_x = size_stats.ile(textord_initialx_ile); + max_y = ceil(initial_x * + (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction + + 2 * tesseract::CCStruct::kAscenderFraction) / + tesseract::CCStruct::kXHeightFraction); + min_y = std::floor(initial_x / 2); + max_x = ceil(initial_x * textord_width_limit); + small_it.move_to_first(); + for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) { + height = small_it.data()->bounding_box().height(); + if (height > max_y) { + 
large_it.add_after_then_move(small_it.extract()); + } else if (height >= min_y) { + src_it.add_after_then_move(small_it.extract()); + } + } + size_stats.clear(); + for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { + height = src_it.data()->bounding_box().height(); + width = src_it.data()->bounding_box().width(); + if (height < min_y) { + small_it.add_after_then_move(src_it.extract()); + } else if (height > max_y || width > max_x) { + large_it.add_after_then_move(src_it.extract()); + } else { + size_stats.add(height, 1); + } + } + max_height = size_stats.ile(textord_initialasc_ile); + // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,", + // max_y,min_y,initial_x,max_height); + max_height *= tesseract::CCStruct::kXHeightCapRatio; + if (max_height > initial_x) { + initial_x = max_height; + } + // tprintf(" ret=%g\n",initial_x); + return initial_x; +} + +// Fixes the block so it obeys all the rules: +// Must have at least one ROW. +// Must have at least one WERD. +// WERDs contain a fake blob. +void Textord::cleanup_nontext_block(BLOCK *block) { + // Non-text blocks must contain at least one row. + ROW_IT row_it(block->row_list()); + if (row_it.empty()) { + const TBOX &box = block->pdblk.bounding_box(); + float height = box.height(); + int32_t xstarts[2] = {box.left(), box.right()}; + double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())}; + ROW *row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1); + row_it.add_after_then_move(row); + } + // Each row must contain at least one word. + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW *row = row_it.data(); + WERD_IT w_it(row->word_list()); + if (w_it.empty()) { + // Make a fake blob to put in the word. + TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box(); + C_BLOB *blob = C_BLOB::FakeBlob(box); + C_BLOB_LIST blobs; + C_BLOB_IT blob_it(&blobs); + blob_it.add_after_then_move(blob); + WERD *word = new WERD(&blobs, 0, nullptr); + w_it.add_after_then_move(word); + } + // Each word must contain a fake blob. + for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { + WERD *word = w_it.data(); + // Just assert that this is true, as it would be useful to find + // out why it isn't. + ASSERT_HOST(!word->cblob_list()->empty()); + } + row->recalc_bounding_box(); + } +} + +/********************************************************************** + * cleanup_blocks + * + * Delete empty blocks, rows from the page. + **********************************************************************/ + +void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) { + BLOCK_IT block_it = blocks; // iterator + ROW_IT row_it; // row iterator + + int num_rows = 0; + int num_rows_all = 0; + int num_blocks = 0; + int num_blocks_all = 0; + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + BLOCK *block = block_it.data(); + if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { + cleanup_nontext_block(block); + continue; + } + num_rows = 0; + num_rows_all = 0; + if (clean_noise) { + row_it.set_to_list(block->row_list()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW *row = row_it.data(); + ++num_rows_all; + clean_small_noise_from_words(row); + if ((textord_noise_rejrows && !row->word_list()->empty() && clean_noise_from_row(row)) || + row->word_list()->empty()) { + delete row_it.extract(); // lose empty row. 
+ } else { + if (textord_noise_rejwords) { + clean_noise_from_words(row_it.data()); + } + if (textord_blshift_maxshift >= 0) { + tweak_row_baseline(row, textord_blshift_maxshift, textord_blshift_xfraction); + } + ++num_rows; + } + } + } + if (block->row_list()->empty()) { + delete block_it.extract(); // Lose empty text blocks. + } else { + ++num_blocks; + } + ++num_blocks_all; + if (textord_noise_debug) { + tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all); + } + } + if (textord_noise_debug) { + tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all); + } +} + +/********************************************************************** + * clean_noise_from_row + * + * Move blobs of words from rows of garbage into the reject blobs list. + **********************************************************************/ + +bool Textord::clean_noise_from_row( // remove empties + ROW *row // row to clean +) { + bool testing_on; + TBOX blob_box; // bounding box + C_BLOB *blob; // current blob + C_OUTLINE *outline; // current outline + WERD *word; // current word + int32_t blob_size; // biggest size + int32_t trans_count = 0; // no of transitions + int32_t trans_threshold; // noise tolerance + int32_t dot_count; // small objects + int32_t norm_count; // normal objects + int32_t super_norm_count; // real char-like + // words of row + WERD_IT word_it = row->word_list(); + C_BLOB_IT blob_it; // blob iterator + C_OUTLINE_IT out_it; // outline iterator + + testing_on = textord_test_y > row->base_line(textord_test_x) && textord_show_blobs && + textord_test_y < row->base_line(textord_test_x) + row->x_height(); + dot_count = 0; + norm_count = 0; + super_norm_count = 0; + for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { + word = word_it.data(); // current word + // blobs in word + blob_it.set_to_list(word->cblob_list()); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + blob = blob_it.data(); + if (!word->flag(W_DONT_CHOP)) { + // get outlines + out_it.set_to_list(blob->out_list()); + for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { + outline = out_it.data(); + blob_box = outline->bounding_box(); + blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height(); + if (blob_size < textord_noise_sizelimit * row->x_height()) { + dot_count++; // count small outlines + } + if (!outline->child()->empty() && + blob_box.height() < (1 + textord_noise_syfract) * row->x_height() && + blob_box.height() > (1 - textord_noise_syfract) * row->x_height() && + blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() && + blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) { + super_norm_count++; // count small outlines + } + } + } else { + super_norm_count++; + } + blob_box = blob->bounding_box(); + blob_size = blob_box.width() > blob_box.height() ? 
blob_box.width() : blob_box.height(); + if (blob_size >= textord_noise_sizelimit * row->x_height() && + blob_size < row->x_height() * 2) { + trans_threshold = blob_size / textord_noise_sizefraction; + trans_count = blob->count_transitions(trans_threshold); + if (trans_count < textord_noise_translimit) { + norm_count++; + } + } else if (blob_box.height() > row->x_height() * 2 && + (!word_it.at_first() || !blob_it.at_first())) { + dot_count += 2; + } + if (testing_on) { + tprintf("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", blob_box.left(), + blob_box.bottom(), blob_box.right(), blob_box.top(), blob->out_list()->length(), + trans_count, blob_box.bottom() - row->base_line(blob_box.left())); + } + } + } + // TODO: check whether `&& super_norm_count < textord_noise_sncount`should always be added here. + bool rejected = dot_count > norm_count * textord_noise_normratio && + dot_count > 2; + if (textord_noise_debug) { + tprintf("Row ending at (%d,%g):", blob_box.right(), row->base_line(blob_box.right())); + tprintf(" R=%g, dc=%d, nc=%d, %s\n", + norm_count > 0 ? static_cast<float>(dot_count) / norm_count : 9999, dot_count, + norm_count, + rejected? "REJECTED": "ACCEPTED"); + } + return super_norm_count < textord_noise_sncount && rejected; +} + +/********************************************************************** + * clean_noise_from_words + * + * Move blobs of words from rows of garbage into the reject blobs list. + **********************************************************************/ + +void Textord::clean_noise_from_words( // remove empties + ROW *row // row to clean +) { + TBOX blob_box; // bounding box + C_BLOB *blob; // current blob + C_OUTLINE *outline; // current outline + WERD *word; // current word + int32_t blob_size; // biggest size + int32_t trans_count; // no of transitions + int32_t trans_threshold; // noise tolerance + int32_t dot_count; // small objects + int32_t norm_count; // normal objects + int32_t dud_words; // number discarded + int32_t ok_words; // number remaining + int32_t word_index; // current word + // words of row + WERD_IT word_it = row->word_list(); + C_BLOB_IT blob_it; // blob iterator + C_OUTLINE_IT out_it; // outline iterator + + ok_words = word_it.length(); + if (ok_words == 0 || textord_no_rejects) { + return; + } + // was it chucked + std::vector<int8_t> word_dud(ok_words); + dud_words = 0; + ok_words = 0; + word_index = 0; + for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { + word = word_it.data(); // current word + dot_count = 0; + norm_count = 0; + // blobs in word + blob_it.set_to_list(word->cblob_list()); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + blob = blob_it.data(); + if (!word->flag(W_DONT_CHOP)) { + // get outlines + out_it.set_to_list(blob->out_list()); + for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { + outline = out_it.data(); + blob_box = outline->bounding_box(); + blob_size = blob_box.width() > blob_box.height() ? 
blob_box.width() : blob_box.height(); + if (blob_size < textord_noise_sizelimit * row->x_height()) { + dot_count++; // count small outlines + } + if (!outline->child()->empty() && + blob_box.height() < (1 + textord_noise_syfract) * row->x_height() && + blob_box.height() > (1 - textord_noise_syfract) * row->x_height() && + blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() && + blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) { + norm_count++; // count small outlines + } + } + } else { + norm_count++; + } + blob_box = blob->bounding_box(); + blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height(); + if (blob_size >= textord_noise_sizelimit * row->x_height() && + blob_size < row->x_height() * 2) { + trans_threshold = blob_size / textord_noise_sizefraction; + trans_count = blob->count_transitions(trans_threshold); + if (trans_count < textord_noise_translimit) { + norm_count++; + } + } else if (blob_box.height() > row->x_height() * 2 && + (!word_it.at_first() || !blob_it.at_first())) { + dot_count += 2; + } + } + if (dot_count > 2 && !word->flag(W_REP_CHAR)) { + if (dot_count > norm_count * textord_noise_normratio * 2) { + word_dud[word_index] = 2; + } else if (dot_count > norm_count * textord_noise_normratio) { + word_dud[word_index] = 1; + } else { + word_dud[word_index] = 0; + } + } else { + word_dud[word_index] = 0; + } + if (word_dud[word_index] == 2) { + dud_words++; + } else { + ok_words++; + } + word_index++; + } + + word_index = 0; + for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { + if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { + word = word_it.data(); // Current word. + // Previously we threw away the entire word. + // Now just aggressively throw all small blobs into the reject list, where + // the classifier can decide whether they are actually needed. + word->CleanNoise(textord_noise_sizelimit * row->x_height()); + } + word_index++; + } +} + +// Remove outlines that are a tiny fraction in either width or height +// of the word height. +void Textord::clean_small_noise_from_words(ROW *row) { + WERD_IT word_it(row->word_list()); + for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { + WERD *word = word_it.data(); + int min_size = static_cast<int>(textord_noise_hfract * word->bounding_box().height() + 0.5); + C_BLOB_IT blob_it(word->cblob_list()); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + C_BLOB *blob = blob_it.data(); + C_OUTLINE_IT out_it(blob->out_list()); + for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { + C_OUTLINE *outline = out_it.data(); + outline->RemoveSmallRecursive(min_size, &out_it); + } + if (blob->out_list()->empty()) { + delete blob_it.extract(); + } + } + if (word->cblob_list()->empty()) { + if (!word_it.at_last()) { + // The next word is no longer a fuzzy non space if it was before, + // since the word before is about to be deleted. + WERD *next_word = word_it.data_relative(1); + if (next_word->flag(W_FUZZY_NON)) { + next_word->set_flag(W_FUZZY_NON, false); + } + } + delete word_it.extract(); + } + } +} + +// Local struct to hold a group of blocks. 
+struct BlockGroup { + BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {} + explicit BlockGroup(BLOCK *block) + : bounding_box(block->pdblk.bounding_box()) + , rotation(block->re_rotation()) + , angle(block->re_rotation().angle()) + , min_xheight(block->x_height()) { + blocks.push_back(block); + } + // Union of block bounding boxes. + TBOX bounding_box; + // Common rotation of the blocks. + FCOORD rotation; + // Angle of rotation. + float angle; + // Min xheight of the blocks. + float min_xheight; + // Collection of borrowed pointers to the blocks in the group. + std::vector<BLOCK *> blocks; +}; + +// Groups blocks by rotation, then, for each group, makes a WordGrid and calls +// TransferDiacriticsToWords to copy the diacritic blobs to the most +// appropriate words in the group of blocks. Source blobs are not touched. +void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks) { + // Angle difference larger than this is too much to consider equal. + // They should only be in multiples of M_PI/2 anyway. + const double kMaxAngleDiff = 0.01; // About 0.6 degrees. + std::vector<std::unique_ptr<BlockGroup>> groups; + BLOCK_IT bk_it(blocks); + for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) { + BLOCK *block = bk_it.data(); + if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { + continue; + } + // Linear search of the groups to find a matching rotation. + float block_angle = block->re_rotation().angle(); + int best_g = 0; + float best_angle_diff = FLT_MAX; + for (const auto &group : groups) { + double angle_diff = std::fabs(block_angle - group->angle); + if (angle_diff > M_PI) { + angle_diff = fabs(angle_diff - 2.0 * M_PI); + } + if (angle_diff < best_angle_diff) { + best_angle_diff = angle_diff; + best_g = &group - &groups[0]; + } + } + if (best_angle_diff > kMaxAngleDiff) { + groups.push_back(std::make_unique<BlockGroup>(block)); + } else { + groups[best_g]->blocks.push_back(block); + groups[best_g]->bounding_box += block->pdblk.bounding_box(); + float x_height = block->x_height(); + if (x_height < groups[best_g]->min_xheight) { + groups[best_g]->min_xheight = x_height; + } + } + } + // Now process each group of blocks. + std::vector<std::unique_ptr<WordWithBox>> word_ptrs; + for (const auto &group : groups) { + if (group->bounding_box.null_box()) { + continue; + } + WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(), + group->bounding_box.topright()); + for (auto b : group->blocks) { + ROW_IT row_it(b->row_list()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW *row = row_it.data(); + // Put the words of the row into the grid. + WERD_IT w_it(row->word_list()); + for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { + WERD *word = w_it.data(); + auto box_word = std::make_unique<WordWithBox>(word); + word_grid.InsertBBox(true, true, box_word.get()); + // Save the pointer where it will be auto-deleted. + word_ptrs.emplace_back(std::move(box_word)); + } + } + } + FCOORD rotation = group->rotation; + // Make it a forward rotation that will transform blob coords to block. + rotation.set_y(-rotation.y()); + TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid); + } +} + +// Places a copy of blobs that are near a word (after applying rotation to the +// blob) in the most appropriate word, unless there is doubt, in which case a +// blob can end up in two words. Source blobs are not touched. 
+void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, const FCOORD &rotation, + WordGrid *word_grid) { + WordSearch ws(word_grid); + BLOBNBOX_IT b_it(diacritic_blobs); + // Apply rotation to each blob before finding the nearest words. The rotation + // allows us to only consider above/below placement and not left/right on + // vertical text, because all text is horizontal here. + for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { + BLOBNBOX *blobnbox = b_it.data(); + TBOX blob_box = blobnbox->bounding_box(); + blob_box.rotate(rotation); + ws.StartRectSearch(blob_box); + // Above/below refer to word position relative to diacritic. Since some + // scripts eg Kannada/Telugu habitually put diacritics below words, and + // others eg Thai/Vietnamese/Latin put most diacritics above words, try + // for both if there isn't much in it. + WordWithBox *best_above_word = nullptr; + WordWithBox *best_below_word = nullptr; + int best_above_distance = 0; + int best_below_distance = 0; + for (WordWithBox *word = ws.NextRectSearch(); word != nullptr; word = ws.NextRectSearch()) { + if (word->word()->flag(W_REP_CHAR)) { + continue; + } + TBOX word_box = word->true_bounding_box(); + int x_distance = blob_box.x_gap(word_box); + int y_distance = blob_box.y_gap(word_box); + if (x_distance > 0) { + // Arbitrarily divide x-distance by 2 if there is a major y overlap, + // and the word is to the left of the diacritic. If the + // diacritic is a dropped broken character between two words, this will + // help send all the pieces to a single word, instead of splitting them + // over the 2 words. + if (word_box.major_y_overlap(blob_box) && blob_box.left() > word_box.right()) { + x_distance /= 2; + } + y_distance += x_distance; + } + if (word_box.y_middle() > blob_box.y_middle() && + (best_above_word == nullptr || y_distance < best_above_distance)) { + best_above_word = word; + best_above_distance = y_distance; + } + if (word_box.y_middle() <= blob_box.y_middle() && + (best_below_word == nullptr || y_distance < best_below_distance)) { + best_below_word = word; + best_below_distance = y_distance; + } + } + bool above_good = best_above_word != nullptr && + (best_below_word == nullptr || + best_above_distance < best_below_distance + blob_box.height()); + bool below_good = best_below_word != nullptr && best_below_word != best_above_word && + (best_above_word == nullptr || + best_below_distance < best_above_distance + blob_box.height()); + if (below_good) { + C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); + copied_blob->rotate(rotation); + // Put the blob into the word's reject blobs list. + C_BLOB_IT blob_it(best_below_word->RejBlobs()); + blob_it.add_to_end(copied_blob); + } + if (above_good) { + C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); + copied_blob->rotate(rotation); + // Put the blob into the word's reject blobs list. + C_BLOB_IT blob_it(best_above_word->RejBlobs()); + blob_it.add_to_end(copied_blob); + } + } +} + +/********************************************************************** + * tweak_row_baseline + * + * Shift baseline to fit the blobs more accurately where they are + * close enough. 
+ **********************************************************************/ + +void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction) { + TBOX blob_box; // bounding box + C_BLOB *blob; // current blob + WERD *word; // current word + int32_t blob_count; // no of blobs + int32_t src_index; // source segment + int32_t dest_index; // destination segment + float ydiff; // baseline error + float x_centre; // centre of blob + // words of row + WERD_IT word_it = row->word_list(); + C_BLOB_IT blob_it; // blob iterator + + blob_count = 0; + for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { + word = word_it.data(); // current word + // get total blobs + blob_count += word->cblob_list()->length(); + } + if (blob_count == 0) { + return; + } + // spline segments + std::vector<int32_t> xstarts(blob_count + row->baseline.segments + 1); + // spline coeffs + std::vector<double> coeffs((blob_count + row->baseline.segments) * 3); + + src_index = 0; + dest_index = 0; + xstarts[0] = row->baseline.xcoords[0]; + for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { + word = word_it.data(); // current word + // blobs in word + blob_it.set_to_list(word->cblob_list()); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + blob = blob_it.data(); + blob_box = blob->bounding_box(); + x_centre = (blob_box.left() + blob_box.right()) / 2.0; + ydiff = blob_box.bottom() - row->base_line(x_centre); + if (ydiff < 0) { + ydiff = -ydiff / row->x_height(); + } else { + ydiff = ydiff / row->x_height(); + } + if (ydiff < blshift_maxshift && blob_box.height() / row->x_height() > blshift_xfraction) { + if (xstarts[dest_index] >= x_centre) { + xstarts[dest_index] = blob_box.left(); + } + coeffs[dest_index * 3] = 0; + coeffs[dest_index * 3 + 1] = 0; + coeffs[dest_index * 3 + 2] = blob_box.bottom(); + // shift it + dest_index++; + xstarts[dest_index] = blob_box.right() + 1; + } else { + if (xstarts[dest_index] <= x_centre) { + while (row->baseline.xcoords[src_index + 1] <= x_centre && + src_index < row->baseline.segments - 1) { + if (row->baseline.xcoords[src_index + 1] > xstarts[dest_index]) { + coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; + coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; + coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; + dest_index++; + xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; + } + src_index++; + } + coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; + coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; + coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; + dest_index++; + xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; + } + } + } + } + while (src_index < row->baseline.segments && + row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) { + src_index++; + } + while (src_index < row->baseline.segments) { + coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; + coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; + coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; + dest_index++; + src_index++; + xstarts[dest_index] = row->baseline.xcoords[src_index]; + } + // turn to spline + row->baseline = QSPLINE(dest_index, &xstarts[0], &coeffs[0]); +} + +} // namespace tesseract
