comparison mupdf-source/thirdparty/tesseract/src/textord/devanagari_processing.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: devanagari_processing.cpp
3 * Description: Methods to process images containing devanagari symbols,
4 * prior to classification.
5 * Author: Shobhit Saxena
6 *
7 * (C) Copyright 2008, Google Inc.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #include "devanagari_processing.h"
25
26 #include "debugpixa.h"
27 #include "statistc.h"
28 #include "tordmain.h"
29
30 #include <allheaders.h>
31
32 namespace tesseract {
33
34 // Flags controlling the debugging information for shiro-rekha splitting
35 // strategies.
36 INT_VAR(devanagari_split_debuglevel, 0, "Debug level for split shiro-rekha process.");
37
38 BOOL_VAR(devanagari_split_debugimage, 0,
39 "Whether to create a debug image for split shiro-rekha process.");
40
41 ShiroRekhaSplitter::ShiroRekhaSplitter() :
42 orig_pix_(nullptr),
43 splitted_image_(nullptr),
44 pageseg_split_strategy_(NO_SPLIT),
45 ocr_split_strategy_(NO_SPLIT),
46 debug_image_(nullptr),
47 segmentation_block_list_(nullptr),
48 global_xheight_(kUnspecifiedXheight),
49 perform_close_(false)
50 {
51 }
52
53 ShiroRekhaSplitter::~ShiroRekhaSplitter() {
54 Clear();
55 }
56
57 void ShiroRekhaSplitter::Clear() {
58 orig_pix_.destroy();
59 splitted_image_.destroy();
60 pageseg_split_strategy_ = NO_SPLIT;
61 ocr_split_strategy_ = NO_SPLIT;
62 debug_image_.destroy();
63 segmentation_block_list_ = nullptr;
64 global_xheight_ = kUnspecifiedXheight;
65 perform_close_ = false;
66 }
67
68 // On setting the input image, a clone of it is owned by this class.
69 void ShiroRekhaSplitter::set_orig_pix(Image pix) {
70 if (orig_pix_) {
71 orig_pix_.destroy();
72 }
73 orig_pix_ = pix.clone();
74 }
75
76 // Top-level method to perform splitting based on current settings.
77 // Returns true if a split was actually performed.
78 // split_for_pageseg should be true if the splitting is being done prior to
79 // page segmentation. This mode uses the flag
80 // pageseg_devanagari_split_strategy to determine the splitting strategy.
81 bool ShiroRekhaSplitter::Split(bool split_for_pageseg, DebugPixa *pixa_debug) {
82 SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ : ocr_split_strategy_;
83 if (split_strategy == NO_SPLIT) {
84 return false; // Nothing to do.
85 }
86 ASSERT_HOST(split_strategy == MINIMAL_SPLIT || split_strategy == MAXIMAL_SPLIT);
87 ASSERT_HOST(orig_pix_);
88 if (devanagari_split_debuglevel > 0) {
89 tprintf("Splitting shiro-rekha ...\n");
90 tprintf("Split strategy = %s\n", split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal");
91 tprintf("Initial pageseg available = %s\n", segmentation_block_list_ ? "yes" : "no");
92 }
93 // Create a copy of original image to store the splitting output.
94 splitted_image_.destroy();
95 splitted_image_ = orig_pix_.copy();
96
97 // Initialize debug image if required.
98 if (devanagari_split_debugimage) {
99 debug_image_.destroy();
100 debug_image_ = pixConvertTo32(orig_pix_);
101 }
102
103 // Determine all connected components in the input image. A close operation
104 // may be required prior to this, depending on the current settings.
105 Image pix_for_ccs = orig_pix_.clone();
106 if (perform_close_ && global_xheight_ != kUnspecifiedXheight && !segmentation_block_list_) {
107 if (devanagari_split_debuglevel > 0) {
108 tprintf("Performing a global close operation..\n");
109 }
110 // A global measure is available for xheight, but no local information
111 // exists.
112 pix_for_ccs.destroy();
113 pix_for_ccs = orig_pix_.copy();
114 PerformClose(pix_for_ccs, global_xheight_);
115 }
116 Pixa *ccs;
117 Boxa *tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);
118 boxaDestroy(&tmp_boxa);
119 pix_for_ccs.destroy();
120
121 // Iterate over all connected components. Get their bounding boxes and clip
122 // out the image regions corresponding to these boxes from the original image.
123 // Conditionally run splitting on each of them.
124 Boxa *regions_to_clear = boxaCreate(0);
125 int num_ccs = 0;
126 if (ccs != nullptr) {
127 num_ccs = pixaGetCount(ccs);
128 }
129 for (int i = 0; i < num_ccs; ++i) {
130 Box *box = pixaGetBox(ccs, i, L_CLONE);
131 Image word_pix = pixClipRectangle(orig_pix_, box, nullptr);
132 ASSERT_HOST(word_pix);
133 int xheight = GetXheightForCC(box);
134 if (xheight == kUnspecifiedXheight && segmentation_block_list_ && devanagari_split_debugimage) {
135 pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);
136 }
137 // If some xheight measure is available, attempt to pre-eliminate small
138 // blobs from the shiro-rekha process. This is primarily to save the CCs
139 // corresponding to punctuation marks/small dots etc which are part of
140 // larger graphemes.
141 l_int32 x, y, w, h;
142 boxGetGeometry(box, &x, &y, &w, &h);
143 if (xheight == kUnspecifiedXheight || (w > xheight / 3 && h > xheight / 2)) {
144 SplitWordShiroRekha(split_strategy, word_pix, xheight, x, y, regions_to_clear);
145 } else if (devanagari_split_debuglevel > 0) {
146 tprintf("CC dropped from splitting: %d,%d (%d, %d)\n", x, y, w, h);
147 }
148 word_pix.destroy();
149 boxDestroy(&box);
150 }
151 // Actually clear the boxes now.
152 for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {
153 Box *box = boxaGetBox(regions_to_clear, i, L_CLONE);
154 pixClearInRect(splitted_image_, box);
155 boxDestroy(&box);
156 }
157 boxaDestroy(&regions_to_clear);
158 pixaDestroy(&ccs);
159 if (devanagari_split_debugimage && pixa_debug != nullptr) {
160 pixa_debug->AddPix(debug_image_, split_for_pageseg ? "pageseg_split" : "ocr_split");
161 }
162 return true;
163 }
164
165 // Method to perform a close operation on the input image. The xheight
166 // estimate decides the size of sel used.
167 void ShiroRekhaSplitter::PerformClose(Image pix, int xheight_estimate) {
168 pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3);
169 }
170
171 // This method resolves the cc bbox to a particular row and returns the row's
172 // xheight.
173 int ShiroRekhaSplitter::GetXheightForCC(Box *cc_bbox) {
174 if (!segmentation_block_list_) {
175 return global_xheight_;
176 }
177 // Compute the box coordinates in Tesseract's coordinate system.
178 l_int32 x, y, w, h;
179 boxGetGeometry(cc_bbox, &x, &y, &w, &h);
180 TBOX bbox(x, pixGetHeight(orig_pix_) - y - h - 1,
181 x + w, pixGetHeight(orig_pix_) - y - 1);
182 // Iterate over all blocks.
183 BLOCK_IT block_it(segmentation_block_list_);
184 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
185 BLOCK *block = block_it.data();
186 // Iterate over all rows in the block.
187 ROW_IT row_it(block->row_list());
188 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
189 ROW *row = row_it.data();
190 if (!row->bounding_box().major_overlap(bbox)) {
191 continue;
192 }
193 // Row could be skewed, warped, etc. Use the position of the box to
194 // determine the baseline position of the row for that x-coordinate.
195 // Create a square TBOX whose baseline's mid-point lies at this point
196 // and side is row's xheight. Take the overlap of this box with the input
197 // box and check if it is a 'major overlap'. If so, this box lies in this
198 // row. In that case, return the xheight for this row.
199 float box_middle = 0.5 * (bbox.left() + bbox.right());
200 int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);
201 TBOX test_box(box_middle - row->x_height() / 2, baseline, box_middle + row->x_height() / 2,
202 static_cast<int>(baseline + row->x_height()));
203 // Compute overlap. If it is a major overlap, this is the right row.
204 if (bbox.major_overlap(test_box)) {
205 return row->x_height();
206 }
207 }
208 }
209 // No row found for this bbox.
210 return kUnspecifiedXheight;
211 }
212
213 // Returns a list of regions (boxes) which should be cleared in the original
214 // image so as to perform shiro-rekha splitting. Pix is assumed to carry one
215 // (or less) word only. Xheight measure could be the global estimate, the row
216 // estimate, or unspecified. If unspecified, over splitting may occur, since a
217 // conservative estimate of stroke width along with an associated multiplier
218 // is used in its place. It is advisable to have a specified xheight when
219 // splitting for classification/training.
220 // A vertical projection histogram of all the on-pixels in the input pix is
221 // computed. The maxima of this histogram is regarded as an approximate location
222 // of the shiro-rekha. By descending on the maxima's peak on both sides,
223 // stroke width of shiro-rekha is estimated.
224 // A horizontal projection histogram is computed for a sub-image of the input
225 // image, which extends from just below the shiro-rekha down to a certain
226 // leeway. The leeway depends on the input xheight, if provided, else a
227 // conservative multiplier on approximate stroke width is used (which may lead
228 // to over-splitting).
229 void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy, Image pix, int xheight,
230 int word_left, int word_top, Boxa *regions_to_clear) {
231 if (split_strategy == NO_SPLIT) {
232 return;
233 }
234 int width = pixGetWidth(pix);
235 int height = pixGetHeight(pix);
236 // Statistically determine the yextents of the shiro-rekha.
237 int shirorekha_top, shirorekha_bottom, shirorekha_ylevel;
238 GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom, &shirorekha_ylevel);
239 // Since the shiro rekha is also a stroke, its width is equal to the stroke
240 // width.
241 int stroke_width = shirorekha_bottom - shirorekha_top + 1;
242
243 // Some safeguards to protect CCs we do not want to be split.
244 // These are particularly useful when the word wasn't eliminated earlier
245 // because xheight information was unavailable.
246 if (shirorekha_ylevel > height / 2) {
247 // Shirorekha shouldn't be in the bottom half of the word.
248 if (devanagari_split_debuglevel > 0) {
249 tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n", word_left,
250 word_top);
251 }
252 return;
253 }
254 if (stroke_width > height / 3) {
255 // Even the boldest of fonts shouldn't do this.
256 if (devanagari_split_debuglevel > 0) {
257 tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n", word_left, word_top);
258 }
259 return;
260 }
261
262 // Clear the ascender and descender regions of the word.
263 // Obtain a vertical projection histogram for the resulting image.
264 Box *box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3, width, 5 * stroke_width / 3);
265 Image word_in_xheight = pix.copy();
266 pixClearInRect(word_in_xheight, box_to_clear);
267 // Also clear any pixels which are below shirorekha_bottom + some leeway.
268 // The leeway is set to xheight if the information is available, else it is a
269 // multiplier applied to the stroke width.
270 int leeway_to_keep = stroke_width * 3;
271 if (xheight != kUnspecifiedXheight) {
272 // This is because the xheight-region typically includes the shiro-rekha
273 // inside it, i.e., the top of the xheight range corresponds to the top of
274 // shiro-rekha.
275 leeway_to_keep = xheight - stroke_width;
276 }
277 auto y = shirorekha_bottom + leeway_to_keep;
278 boxSetGeometry(box_to_clear, -1, y, -1, height - y);
279 pixClearInRect(word_in_xheight, box_to_clear);
280 boxDestroy(&box_to_clear);
281
282 PixelHistogram vert_hist;
283 vert_hist.ConstructVerticalCountHist(word_in_xheight);
284 word_in_xheight.destroy();
285
286 // If the number of black pixel in any column of the image is less than a
287 // fraction of the stroke width, treat it as noise / a stray mark. Perform
288 // these changes inside the vert_hist data itself, as that is used later on as
289 // a bit vector for the final split decision at every column.
290 for (int i = 0; i < width; ++i) {
291 if (vert_hist.hist()[i] <= stroke_width / 4) {
292 vert_hist.hist()[i] = 0;
293 } else {
294 vert_hist.hist()[i] = 1;
295 }
296 }
297 // In order to split the line at any point, we make sure that the width of the
298 // gap is at least half the stroke width.
299 int i = 0;
300 int cur_component_width = 0;
301 while (i < width) {
302 if (!vert_hist.hist()[i]) {
303 int j = 0;
304 while (i + j < width && !vert_hist.hist()[i + j]) {
305 ++j;
306 }
307 if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) {
308 // Perform a shiro-rekha split. The intervening region lies from i to
309 // i+j-1.
310 // A minimal single-pixel split makes the estimation of intra- and
311 // inter-word spacing easier during page layout analysis,
312 // whereas a maximal split may be needed for OCR, depending on
313 // how the engine was trained.
314 bool minimal_split = (split_strategy == MINIMAL_SPLIT);
315 int split_width = minimal_split ? 1 : j;
316 int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i;
317 if (!minimal_split || (i != 0 && i + j != width)) {
318 Box *box_to_clear =
319 boxCreate(word_left + split_left, word_top + shirorekha_top - stroke_width / 3,
320 split_width, 5 * stroke_width / 3);
321 if (box_to_clear) {
322 boxaAddBox(regions_to_clear, box_to_clear, L_CLONE);
323 // Mark this in the debug image if needed.
324 if (devanagari_split_debugimage) {
325 pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128);
326 }
327 boxDestroy(&box_to_clear);
328 cur_component_width = 0;
329 }
330 }
331 }
332 i += j;
333 } else {
334 ++i;
335 ++cur_component_width;
336 }
337 }
338 }
339
340 // Refreshes the words in the segmentation block list by using blobs in the
341 // input block list.
342 // The segmentation block list must be set.
343 void ShiroRekhaSplitter::RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs) {
344 // The segmentation block list must have been specified.
345 ASSERT_HOST(segmentation_block_list_);
346 if (devanagari_split_debuglevel > 0) {
347 tprintf("Before refreshing blobs:\n");
348 PrintSegmentationStats(segmentation_block_list_);
349 tprintf("New Blobs found: %d\n", new_blobs->length());
350 }
351
352 C_BLOB_LIST not_found_blobs;
353 RefreshWordBlobsFromNewBlobs(
354 segmentation_block_list_, new_blobs,
355 ((devanagari_split_debugimage && debug_image_) ? &not_found_blobs : nullptr));
356
357 if (devanagari_split_debuglevel > 0) {
358 tprintf("After refreshing blobs:\n");
359 PrintSegmentationStats(segmentation_block_list_);
360 }
361 if (devanagari_split_debugimage && debug_image_) {
362 // Plot out the original blobs for which no match was found in the new
363 // all_blobs list.
364 C_BLOB_IT not_found_it(&not_found_blobs);
365 for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) {
366 C_BLOB *not_found = not_found_it.data();
367 TBOX not_found_box = not_found->bounding_box();
368 Box *box_to_plot = GetBoxForTBOX(not_found_box);
369 pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255);
370 boxDestroy(&box_to_plot);
371 }
372
373 // Plot out the blobs unused from all blobs.
374 C_BLOB_IT all_blobs_it(new_blobs);
375 for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) {
376 C_BLOB *a_blob = all_blobs_it.data();
377 Box *box_to_plot = GetBoxForTBOX(a_blob->bounding_box());
378 pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0);
379 boxDestroy(&box_to_plot);
380 }
381 }
382 }
383
384 // Returns a new box object for the corresponding TBOX, based on the original
385 // image's coordinate system.
386 Box *ShiroRekhaSplitter::GetBoxForTBOX(const TBOX &tbox) const {
387 return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1, tbox.width(),
388 tbox.height());
389 }
390
391 // This method returns the computed mode-height of blobs in the pix.
392 // It also prunes very small blobs from calculation.
393 int ShiroRekhaSplitter::GetModeHeight(Image pix) {
394 Boxa *boxa = pixConnComp(pix, nullptr, 8);
395 STATS heights(0, pixGetHeight(pix) - 1);
396 heights.clear();
397 for (int i = 0; i < boxaGetCount(boxa); ++i) {
398 Box *box = boxaGetBox(boxa, i, L_CLONE);
399 l_int32 x, y, w, h;
400 boxGetGeometry(box, &x, &y, &w, &h);
401 if (h >= 3 || w >= 3) {
402 heights.add(h, 1);
403 }
404 boxDestroy(&box);
405 }
406 boxaDestroy(&boxa);
407 return heights.mode();
408 }
409
410 // This method returns y-extents of the shiro-rekha computed from the input
411 // word image.
412 void ShiroRekhaSplitter::GetShiroRekhaYExtents(Image word_pix, int *shirorekha_top,
413 int *shirorekha_bottom, int *shirorekha_ylevel) {
414 // Compute a histogram from projecting the word on a vertical line.
415 PixelHistogram hist_horiz;
416 hist_horiz.ConstructHorizontalCountHist(word_pix);
417 // Get the ylevel where the top-line exists. This is basically the global
418 // maxima in the horizontal histogram.
419 int topline_onpixel_count = 0;
420 int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count);
421
422 // Get the upper and lower extents of the shiro rekha.
423 int thresh = (topline_onpixel_count * 70) / 100;
424 int ulimit = topline_ylevel;
425 int llimit = topline_ylevel;
426 while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh) {
427 --ulimit;
428 }
429 while (llimit < pixGetHeight(word_pix) && hist_horiz.hist()[llimit] >= thresh) {
430 ++llimit;
431 }
432
433 if (shirorekha_top) {
434 *shirorekha_top = ulimit;
435 }
436 if (shirorekha_bottom) {
437 *shirorekha_bottom = llimit;
438 }
439 if (shirorekha_ylevel) {
440 *shirorekha_ylevel = topline_ylevel;
441 }
442 }
443
444 // This method returns the global-maxima for the histogram. The frequency of
445 // the global maxima is returned in count, if specified.
446 int PixelHistogram::GetHistogramMaximum(int *count) const {
447 int best_value = 0;
448 for (int i = 0; i < length_; ++i) {
449 if (hist_[i] > hist_[best_value]) {
450 best_value = i;
451 }
452 }
453 if (count) {
454 *count = hist_[best_value];
455 }
456 return best_value;
457 }
458
459 // Methods to construct histograms from images.
460 void PixelHistogram::ConstructVerticalCountHist(Image pix) {
461 Clear();
462 int width = pixGetWidth(pix);
463 int height = pixGetHeight(pix);
464 hist_ = new int[width];
465 length_ = width;
466 int wpl = pixGetWpl(pix);
467 l_uint32 *data = pixGetData(pix);
468 for (int i = 0; i < width; ++i) {
469 hist_[i] = 0;
470 }
471 for (int i = 0; i < height; ++i) {
472 l_uint32 *line = data + i * wpl;
473 for (int j = 0; j < width; ++j) {
474 if (GET_DATA_BIT(line, j)) {
475 ++(hist_[j]);
476 }
477 }
478 }
479 }
480
481 void PixelHistogram::ConstructHorizontalCountHist(Image pix) {
482 Clear();
483 Numa *counts = pixCountPixelsByRow(pix, nullptr);
484 length_ = numaGetCount(counts);
485 hist_ = new int[length_];
486 for (int i = 0; i < length_; ++i) {
487 l_int32 val = 0;
488 numaGetIValue(counts, i, &val);
489 hist_[i] = val;
490 }
491 numaDestroy(&counts);
492 }
493
494 } // namespace tesseract.