Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/devanagari_processing.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/devanagari_processing.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,494 @@ +/********************************************************************** + * File: devanagari_processing.cpp + * Description: Methods to process images containing devanagari symbols, + * prior to classification. + * Author: Shobhit Saxena + * + * (C) Copyright 2008, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "devanagari_processing.h" + +#include "debugpixa.h" +#include "statistc.h" +#include "tordmain.h" + +#include <allheaders.h> + +namespace tesseract { + +// Flags controlling the debugging information for shiro-rekha splitting +// strategies. +INT_VAR(devanagari_split_debuglevel, 0, "Debug level for split shiro-rekha process."); + +BOOL_VAR(devanagari_split_debugimage, 0, + "Whether to create a debug image for split shiro-rekha process."); + +ShiroRekhaSplitter::ShiroRekhaSplitter() : + orig_pix_(nullptr), + splitted_image_(nullptr), + pageseg_split_strategy_(NO_SPLIT), + ocr_split_strategy_(NO_SPLIT), + debug_image_(nullptr), + segmentation_block_list_(nullptr), + global_xheight_(kUnspecifiedXheight), + perform_close_(false) +{ +} + +ShiroRekhaSplitter::~ShiroRekhaSplitter() { + Clear(); +} + +void ShiroRekhaSplitter::Clear() { + orig_pix_.destroy(); + splitted_image_.destroy(); + pageseg_split_strategy_ = NO_SPLIT; + ocr_split_strategy_ = NO_SPLIT; + debug_image_.destroy(); + segmentation_block_list_ = nullptr; + global_xheight_ = kUnspecifiedXheight; + perform_close_ = false; +} + +// On setting the input image, a clone of it is owned by this class. +void ShiroRekhaSplitter::set_orig_pix(Image pix) { + if (orig_pix_) { + orig_pix_.destroy(); + } + orig_pix_ = pix.clone(); +} + +// Top-level method to perform splitting based on current settings. +// Returns true if a split was actually performed. +// split_for_pageseg should be true if the splitting is being done prior to +// page segmentation. This mode uses the flag +// pageseg_devanagari_split_strategy to determine the splitting strategy. +bool ShiroRekhaSplitter::Split(bool split_for_pageseg, DebugPixa *pixa_debug) { + SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ : ocr_split_strategy_; + if (split_strategy == NO_SPLIT) { + return false; // Nothing to do. + } + ASSERT_HOST(split_strategy == MINIMAL_SPLIT || split_strategy == MAXIMAL_SPLIT); + ASSERT_HOST(orig_pix_); + if (devanagari_split_debuglevel > 0) { + tprintf("Splitting shiro-rekha ...\n"); + tprintf("Split strategy = %s\n", split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal"); + tprintf("Initial pageseg available = %s\n", segmentation_block_list_ ? "yes" : "no"); + } + // Create a copy of original image to store the splitting output. + splitted_image_.destroy(); + splitted_image_ = orig_pix_.copy(); + + // Initialize debug image if required. + if (devanagari_split_debugimage) { + debug_image_.destroy(); + debug_image_ = pixConvertTo32(orig_pix_); + } + + // Determine all connected components in the input image. A close operation + // may be required prior to this, depending on the current settings. + Image pix_for_ccs = orig_pix_.clone(); + if (perform_close_ && global_xheight_ != kUnspecifiedXheight && !segmentation_block_list_) { + if (devanagari_split_debuglevel > 0) { + tprintf("Performing a global close operation..\n"); + } + // A global measure is available for xheight, but no local information + // exists. + pix_for_ccs.destroy(); + pix_for_ccs = orig_pix_.copy(); + PerformClose(pix_for_ccs, global_xheight_); + } + Pixa *ccs; + Boxa *tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8); + boxaDestroy(&tmp_boxa); + pix_for_ccs.destroy(); + + // Iterate over all connected components. Get their bounding boxes and clip + // out the image regions corresponding to these boxes from the original image. + // Conditionally run splitting on each of them. + Boxa *regions_to_clear = boxaCreate(0); + int num_ccs = 0; + if (ccs != nullptr) { + num_ccs = pixaGetCount(ccs); + } + for (int i = 0; i < num_ccs; ++i) { + Box *box = pixaGetBox(ccs, i, L_CLONE); + Image word_pix = pixClipRectangle(orig_pix_, box, nullptr); + ASSERT_HOST(word_pix); + int xheight = GetXheightForCC(box); + if (xheight == kUnspecifiedXheight && segmentation_block_list_ && devanagari_split_debugimage) { + pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0); + } + // If some xheight measure is available, attempt to pre-eliminate small + // blobs from the shiro-rekha process. This is primarily to save the CCs + // corresponding to punctuation marks/small dots etc which are part of + // larger graphemes. + l_int32 x, y, w, h; + boxGetGeometry(box, &x, &y, &w, &h); + if (xheight == kUnspecifiedXheight || (w > xheight / 3 && h > xheight / 2)) { + SplitWordShiroRekha(split_strategy, word_pix, xheight, x, y, regions_to_clear); + } else if (devanagari_split_debuglevel > 0) { + tprintf("CC dropped from splitting: %d,%d (%d, %d)\n", x, y, w, h); + } + word_pix.destroy(); + boxDestroy(&box); + } + // Actually clear the boxes now. + for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) { + Box *box = boxaGetBox(regions_to_clear, i, L_CLONE); + pixClearInRect(splitted_image_, box); + boxDestroy(&box); + } + boxaDestroy(®ions_to_clear); + pixaDestroy(&ccs); + if (devanagari_split_debugimage && pixa_debug != nullptr) { + pixa_debug->AddPix(debug_image_, split_for_pageseg ? "pageseg_split" : "ocr_split"); + } + return true; +} + +// Method to perform a close operation on the input image. The xheight +// estimate decides the size of sel used. +void ShiroRekhaSplitter::PerformClose(Image pix, int xheight_estimate) { + pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3); +} + +// This method resolves the cc bbox to a particular row and returns the row's +// xheight. +int ShiroRekhaSplitter::GetXheightForCC(Box *cc_bbox) { + if (!segmentation_block_list_) { + return global_xheight_; + } + // Compute the box coordinates in Tesseract's coordinate system. + l_int32 x, y, w, h; + boxGetGeometry(cc_bbox, &x, &y, &w, &h); + TBOX bbox(x, pixGetHeight(orig_pix_) - y - h - 1, + x + w, pixGetHeight(orig_pix_) - y - 1); + // Iterate over all blocks. + BLOCK_IT block_it(segmentation_block_list_); + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + BLOCK *block = block_it.data(); + // Iterate over all rows in the block. + ROW_IT row_it(block->row_list()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW *row = row_it.data(); + if (!row->bounding_box().major_overlap(bbox)) { + continue; + } + // Row could be skewed, warped, etc. Use the position of the box to + // determine the baseline position of the row for that x-coordinate. + // Create a square TBOX whose baseline's mid-point lies at this point + // and side is row's xheight. Take the overlap of this box with the input + // box and check if it is a 'major overlap'. If so, this box lies in this + // row. In that case, return the xheight for this row. + float box_middle = 0.5 * (bbox.left() + bbox.right()); + int baseline = static_cast<int>(row->base_line(box_middle) + 0.5); + TBOX test_box(box_middle - row->x_height() / 2, baseline, box_middle + row->x_height() / 2, + static_cast<int>(baseline + row->x_height())); + // Compute overlap. If it is a major overlap, this is the right row. + if (bbox.major_overlap(test_box)) { + return row->x_height(); + } + } + } + // No row found for this bbox. + return kUnspecifiedXheight; +} + +// Returns a list of regions (boxes) which should be cleared in the original +// image so as to perform shiro-rekha splitting. Pix is assumed to carry one +// (or less) word only. Xheight measure could be the global estimate, the row +// estimate, or unspecified. If unspecified, over splitting may occur, since a +// conservative estimate of stroke width along with an associated multiplier +// is used in its place. It is advisable to have a specified xheight when +// splitting for classification/training. +// A vertical projection histogram of all the on-pixels in the input pix is +// computed. The maxima of this histogram is regarded as an approximate location +// of the shiro-rekha. By descending on the maxima's peak on both sides, +// stroke width of shiro-rekha is estimated. +// A horizontal projection histogram is computed for a sub-image of the input +// image, which extends from just below the shiro-rekha down to a certain +// leeway. The leeway depends on the input xheight, if provided, else a +// conservative multiplier on approximate stroke width is used (which may lead +// to over-splitting). +void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy, Image pix, int xheight, + int word_left, int word_top, Boxa *regions_to_clear) { + if (split_strategy == NO_SPLIT) { + return; + } + int width = pixGetWidth(pix); + int height = pixGetHeight(pix); + // Statistically determine the yextents of the shiro-rekha. + int shirorekha_top, shirorekha_bottom, shirorekha_ylevel; + GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom, &shirorekha_ylevel); + // Since the shiro rekha is also a stroke, its width is equal to the stroke + // width. + int stroke_width = shirorekha_bottom - shirorekha_top + 1; + + // Some safeguards to protect CCs we do not want to be split. + // These are particularly useful when the word wasn't eliminated earlier + // because xheight information was unavailable. + if (shirorekha_ylevel > height / 2) { + // Shirorekha shouldn't be in the bottom half of the word. + if (devanagari_split_debuglevel > 0) { + tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n", word_left, + word_top); + } + return; + } + if (stroke_width > height / 3) { + // Even the boldest of fonts shouldn't do this. + if (devanagari_split_debuglevel > 0) { + tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n", word_left, word_top); + } + return; + } + + // Clear the ascender and descender regions of the word. + // Obtain a vertical projection histogram for the resulting image. + Box *box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3, width, 5 * stroke_width / 3); + Image word_in_xheight = pix.copy(); + pixClearInRect(word_in_xheight, box_to_clear); + // Also clear any pixels which are below shirorekha_bottom + some leeway. + // The leeway is set to xheight if the information is available, else it is a + // multiplier applied to the stroke width. + int leeway_to_keep = stroke_width * 3; + if (xheight != kUnspecifiedXheight) { + // This is because the xheight-region typically includes the shiro-rekha + // inside it, i.e., the top of the xheight range corresponds to the top of + // shiro-rekha. + leeway_to_keep = xheight - stroke_width; + } + auto y = shirorekha_bottom + leeway_to_keep; + boxSetGeometry(box_to_clear, -1, y, -1, height - y); + pixClearInRect(word_in_xheight, box_to_clear); + boxDestroy(&box_to_clear); + + PixelHistogram vert_hist; + vert_hist.ConstructVerticalCountHist(word_in_xheight); + word_in_xheight.destroy(); + + // If the number of black pixel in any column of the image is less than a + // fraction of the stroke width, treat it as noise / a stray mark. Perform + // these changes inside the vert_hist data itself, as that is used later on as + // a bit vector for the final split decision at every column. + for (int i = 0; i < width; ++i) { + if (vert_hist.hist()[i] <= stroke_width / 4) { + vert_hist.hist()[i] = 0; + } else { + vert_hist.hist()[i] = 1; + } + } + // In order to split the line at any point, we make sure that the width of the + // gap is at least half the stroke width. + int i = 0; + int cur_component_width = 0; + while (i < width) { + if (!vert_hist.hist()[i]) { + int j = 0; + while (i + j < width && !vert_hist.hist()[i + j]) { + ++j; + } + if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) { + // Perform a shiro-rekha split. The intervening region lies from i to + // i+j-1. + // A minimal single-pixel split makes the estimation of intra- and + // inter-word spacing easier during page layout analysis, + // whereas a maximal split may be needed for OCR, depending on + // how the engine was trained. + bool minimal_split = (split_strategy == MINIMAL_SPLIT); + int split_width = minimal_split ? 1 : j; + int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i; + if (!minimal_split || (i != 0 && i + j != width)) { + Box *box_to_clear = + boxCreate(word_left + split_left, word_top + shirorekha_top - stroke_width / 3, + split_width, 5 * stroke_width / 3); + if (box_to_clear) { + boxaAddBox(regions_to_clear, box_to_clear, L_CLONE); + // Mark this in the debug image if needed. + if (devanagari_split_debugimage) { + pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128); + } + boxDestroy(&box_to_clear); + cur_component_width = 0; + } + } + } + i += j; + } else { + ++i; + ++cur_component_width; + } + } +} + +// Refreshes the words in the segmentation block list by using blobs in the +// input block list. +// The segmentation block list must be set. +void ShiroRekhaSplitter::RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs) { + // The segmentation block list must have been specified. + ASSERT_HOST(segmentation_block_list_); + if (devanagari_split_debuglevel > 0) { + tprintf("Before refreshing blobs:\n"); + PrintSegmentationStats(segmentation_block_list_); + tprintf("New Blobs found: %d\n", new_blobs->length()); + } + + C_BLOB_LIST not_found_blobs; + RefreshWordBlobsFromNewBlobs( + segmentation_block_list_, new_blobs, + ((devanagari_split_debugimage && debug_image_) ? ¬_found_blobs : nullptr)); + + if (devanagari_split_debuglevel > 0) { + tprintf("After refreshing blobs:\n"); + PrintSegmentationStats(segmentation_block_list_); + } + if (devanagari_split_debugimage && debug_image_) { + // Plot out the original blobs for which no match was found in the new + // all_blobs list. + C_BLOB_IT not_found_it(¬_found_blobs); + for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) { + C_BLOB *not_found = not_found_it.data(); + TBOX not_found_box = not_found->bounding_box(); + Box *box_to_plot = GetBoxForTBOX(not_found_box); + pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255); + boxDestroy(&box_to_plot); + } + + // Plot out the blobs unused from all blobs. + C_BLOB_IT all_blobs_it(new_blobs); + for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) { + C_BLOB *a_blob = all_blobs_it.data(); + Box *box_to_plot = GetBoxForTBOX(a_blob->bounding_box()); + pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0); + boxDestroy(&box_to_plot); + } + } +} + +// Returns a new box object for the corresponding TBOX, based on the original +// image's coordinate system. +Box *ShiroRekhaSplitter::GetBoxForTBOX(const TBOX &tbox) const { + return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1, tbox.width(), + tbox.height()); +} + +// This method returns the computed mode-height of blobs in the pix. +// It also prunes very small blobs from calculation. +int ShiroRekhaSplitter::GetModeHeight(Image pix) { + Boxa *boxa = pixConnComp(pix, nullptr, 8); + STATS heights(0, pixGetHeight(pix) - 1); + heights.clear(); + for (int i = 0; i < boxaGetCount(boxa); ++i) { + Box *box = boxaGetBox(boxa, i, L_CLONE); + l_int32 x, y, w, h; + boxGetGeometry(box, &x, &y, &w, &h); + if (h >= 3 || w >= 3) { + heights.add(h, 1); + } + boxDestroy(&box); + } + boxaDestroy(&boxa); + return heights.mode(); +} + +// This method returns y-extents of the shiro-rekha computed from the input +// word image. +void ShiroRekhaSplitter::GetShiroRekhaYExtents(Image word_pix, int *shirorekha_top, + int *shirorekha_bottom, int *shirorekha_ylevel) { + // Compute a histogram from projecting the word on a vertical line. + PixelHistogram hist_horiz; + hist_horiz.ConstructHorizontalCountHist(word_pix); + // Get the ylevel where the top-line exists. This is basically the global + // maxima in the horizontal histogram. + int topline_onpixel_count = 0; + int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count); + + // Get the upper and lower extents of the shiro rekha. + int thresh = (topline_onpixel_count * 70) / 100; + int ulimit = topline_ylevel; + int llimit = topline_ylevel; + while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh) { + --ulimit; + } + while (llimit < pixGetHeight(word_pix) && hist_horiz.hist()[llimit] >= thresh) { + ++llimit; + } + + if (shirorekha_top) { + *shirorekha_top = ulimit; + } + if (shirorekha_bottom) { + *shirorekha_bottom = llimit; + } + if (shirorekha_ylevel) { + *shirorekha_ylevel = topline_ylevel; + } +} + +// This method returns the global-maxima for the histogram. The frequency of +// the global maxima is returned in count, if specified. +int PixelHistogram::GetHistogramMaximum(int *count) const { + int best_value = 0; + for (int i = 0; i < length_; ++i) { + if (hist_[i] > hist_[best_value]) { + best_value = i; + } + } + if (count) { + *count = hist_[best_value]; + } + return best_value; +} + +// Methods to construct histograms from images. +void PixelHistogram::ConstructVerticalCountHist(Image pix) { + Clear(); + int width = pixGetWidth(pix); + int height = pixGetHeight(pix); + hist_ = new int[width]; + length_ = width; + int wpl = pixGetWpl(pix); + l_uint32 *data = pixGetData(pix); + for (int i = 0; i < width; ++i) { + hist_[i] = 0; + } + for (int i = 0; i < height; ++i) { + l_uint32 *line = data + i * wpl; + for (int j = 0; j < width; ++j) { + if (GET_DATA_BIT(line, j)) { + ++(hist_[j]); + } + } + } +} + +void PixelHistogram::ConstructHorizontalCountHist(Image pix) { + Clear(); + Numa *counts = pixCountPixelsByRow(pix, nullptr); + length_ = numaGetCount(counts); + hist_ = new int[length_]; + for (int i = 0; i < length_; ++i) { + l_int32 val = 0; + numaGetIValue(counts, i, &val); + hist_[i] = val; + } + numaDestroy(&counts); +} + +} // namespace tesseract.
