Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/wordseg.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/wordseg.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,605 @@ +/********************************************************************** + * File: wordseg.cpp (Formerly wspace.c) + * Description: Code to segment the blobs into words. + * Author: Ray Smith + * + * (C) Copyright 1992, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "wordseg.h" + +#include <cmath> + +#include "blobbox.h" +#include "cjkpitch.h" +#include "drawtord.h" +#include "fpchop.h" +#include "makerow.h" +#include "pitsync1.h" +#include "statistc.h" +#include "textord.h" +#include "topitch.h" +#include "tovars.h" + +namespace tesseract { + +BOOL_VAR(textord_force_make_prop_words, false, "Force proportional word segmentation on all rows"); +BOOL_VAR(textord_chopper_test, false, "Chopper is being tested."); + +#define BLOCK_STATS_CLUSTERS 10 + +/** + * @name make_single_word + * + * For each row, arrange the blobs into one word. There is no fixed + * pitch detection. + */ + +void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) { + TO_ROW_IT to_row_it(rows); + ROW_IT row_it(real_rows); + for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) { + TO_ROW *row = to_row_it.data(); + // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready + // to create the word. + C_BLOB_LIST cblobs; + C_BLOB_IT cblob_it(&cblobs); + BLOBNBOX_IT box_it(row->blob_list()); + for (; !box_it.empty(); box_it.forward()) { + BLOBNBOX *bblob = box_it.extract(); + if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { + auto cblob = bblob->remove_cblob(); + if (cblob != nullptr) { + C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); + cout_it.move_to_last(); + cout_it.add_list_after(cblob->out_list()); + delete cblob; + } + } else { + auto cblob = bblob->remove_cblob(); + if (cblob != nullptr) { + cblob_it.add_after_then_move(cblob); + } + } + delete bblob; + } + // Convert the TO_ROW to a ROW. + ROW *real_row = + new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size)); + WERD_IT word_it(real_row->word_list()); + WERD *word = new WERD(&cblobs, 0, nullptr); + word->set_flag(W_BOL, true); + word->set_flag(W_EOL, true); + word->set_flag(W_DONT_CHOP, one_blob); + word_it.add_after_then_move(word); + real_row->recalc_bounding_box(); + row_it.add_after_then_move(real_row); + } +} + +/** + * make_words + * + * Arrange the blobs into words. + */ +void make_words(tesseract::Textord *textord, + ICOORD page_tr, // top right + float gradient, // page skew + BLOCK_LIST *blocks, // block list + TO_BLOCK_LIST *port_blocks) { // output list + TO_BLOCK_IT block_it; // iterator + TO_BLOCK *block; // current block + + if (textord->use_cjk_fp_model()) { + compute_fixed_pitch_cjk(page_tr, port_blocks); + } else { + compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f), + !bool(textord_test_landscape)); + } + textord->to_spacing(page_tr, port_blocks); + block_it.set_to_list(port_blocks); + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + block = block_it.data(); + make_real_words(textord, block, FCOORD(1.0f, 0.0f)); + } +} + +/** + * @name set_row_spaces + * + * Set the min_space and max_nonspace members of the row so that + * the blobs can be arranged into words. + */ + +void set_row_spaces( // find space sizes + TO_BLOCK *block, // block to do + FCOORD rotation, // for drawing + bool testing_on // correct orientation +) { + TO_ROW *row; // current row + TO_ROW_IT row_it = block->get_rows(); + + if (row_it.empty()) { + return; // empty block + } + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + if (row->fixed_pitch == 0) { + row->min_space = static_cast<int32_t>( + ceil(row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread)); + row->max_nonspace = static_cast<int32_t>( + floor(row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread)); + if (testing_on && textord_show_initial_words) { + tprintf("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace, + row->min_space, row->intercept()); + } + row->space_threshold = (row->max_nonspace + row->min_space) / 2; + row->space_size = row->pr_space; + row->kern_size = row->pr_nonsp; + } +#ifndef GRAPHICS_DISABLED + if (textord_show_initial_words && testing_on) { + plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row); + } +#endif + } +} + +/** + * @name row_words + * + * Compute the max nonspace and min space for the row. + */ + +int32_t row_words( // compute space size + TO_BLOCK *block, // block it came from + TO_ROW *row, // row to operate on + int32_t maxwidth, // max expected space size + FCOORD rotation, // for drawing + bool testing_on // for debug +) { + bool testing_row; // contains testpt + bool prev_valid; // if decent size + int32_t prev_x; // end of prev blob + int32_t cluster_count; // no of clusters + int32_t gap_index; // which cluster + int32_t smooth_factor; // for smoothing stats + BLOBNBOX *blob; // current blob + float lower, upper; // clustering parameters + float gaps[3]; // gap clusers + ICOORD testpt; + TBOX blob_box; // bounding box + // iterator + BLOBNBOX_IT blob_it = row->blob_list(); + STATS gap_stats(0, maxwidth - 1); + STATS cluster_stats[4]; // clusters + + testpt = ICOORD(textord_test_x, textord_test_y); + smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5); + // if (testing_on) + // tprintf("Row smooth factor=%d\n",smooth_factor); + prev_valid = false; + prev_x = -INT32_MAX; + testing_row = false; + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + blob = blob_it.data(); + blob_box = blob->bounding_box(); + if (blob_box.contains(testpt)) { + testing_row = true; + } + gap_stats.add(blob_box.width(), 1); + } + gap_stats.clear(); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + blob = blob_it.data(); + if (!blob->joined_to_prev()) { + blob_box = blob->bounding_box(); + if (prev_valid && blob_box.left() - prev_x < maxwidth) { + gap_stats.add(blob_box.left() - prev_x, 1); + } + prev_valid = true; + prev_x = blob_box.right(); + } + } + if (gap_stats.get_total() == 0) { + row->min_space = 0; // no evidence + row->max_nonspace = 0; + return 0; + } + gap_stats.smooth(smooth_factor); + lower = row->xheight * textord_words_initial_lower; + upper = row->xheight * textord_words_initial_upper; + cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats); + while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) { + // shrink gap + upper = (upper * 3 + lower) / 4; + lower = (lower * 3 + upper) / 4; + cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats); + } + if (cluster_count < 2) { + row->min_space = 0; // no evidence + row->max_nonspace = 0; + return 0; + } + for (gap_index = 0; gap_index < cluster_count; gap_index++) { + gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5); + } + // get medians + if (cluster_count > 2) { + if (testing_on && textord_show_initial_words) { + tprintf("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept(), + cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5)); + } + lower = gaps[0]; + if (gaps[1] > lower) { + upper = gaps[1]; // prefer most frequent + if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) { + upper = gaps[2]; + } + } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) { + upper = gaps[2]; + } else if (lower >= block->xheight * textord_words_min_minspace) { + upper = lower; // not nice + lower = gaps[1]; + if (testing_on && textord_show_initial_words) { + tprintf("Had to switch most common from lower to upper!!\n"); + gap_stats.print(); + } + } else { + row->min_space = 0; // no evidence + row->max_nonspace = 0; + return 0; + } + } else { + if (gaps[1] < gaps[0]) { + if (testing_on && textord_show_initial_words) { + tprintf("Had to switch most common from lower to upper!!\n"); + gap_stats.print(); + } + lower = gaps[1]; + upper = gaps[0]; + } else { + upper = gaps[1]; + lower = gaps[0]; + } + } + if (upper < block->xheight * textord_words_min_minspace) { + row->min_space = 0; // no evidence + row->max_nonspace = 0; + return 0; + } + if (upper * 3 < block->min_space * 2 + block->max_nonspace || + lower * 3 > block->min_space * 2 + block->max_nonspace) { + if (testing_on && textord_show_initial_words) { + tprintf("Disagreement between block and row at %g!!\n", row->intercept()); + tprintf("Lower=%g, upper=%g, Stats:\n", lower, upper); + gap_stats.print(); + } + } + row->min_space = + static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread)); + row->max_nonspace = + static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread)); + row->space_threshold = (row->max_nonspace + row->min_space) / 2; + row->space_size = upper; + row->kern_size = lower; + if (testing_on && textord_show_initial_words) { + if (testing_row) { + tprintf("GAP STATS\n"); + gap_stats.print(); + tprintf("SPACE stats\n"); + cluster_stats[2].print_summary(); + tprintf("NONSPACE stats\n"); + cluster_stats[1].print_summary(); + } + tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space, + upper, row->max_nonspace, lower); + } + return cluster_stats[2].get_total(); +} + +/** + * @name row_words2 + * + * Compute the max nonspace and min space for the row. + */ + +int32_t row_words2( // compute space size + TO_BLOCK *block, // block it came from + TO_ROW *row, // row to operate on + int32_t maxwidth, // max expected space size + FCOORD rotation, // for drawing + bool testing_on // for debug +) { + bool prev_valid; // if decent size + bool this_valid; // current blob big enough + int32_t prev_x; // end of prev blob + int32_t min_width; // min interesting width + int32_t valid_count; // good gaps + int32_t total_count; // total gaps + int32_t cluster_count; // no of clusters + int32_t prev_count; // previous cluster_count + int32_t gap_index; // which cluster + int32_t smooth_factor; // for smoothing stats + BLOBNBOX *blob; // current blob + float lower, upper; // clustering parameters + ICOORD testpt; + TBOX blob_box; // bounding box + // iterator + BLOBNBOX_IT blob_it = row->blob_list(); + STATS gap_stats(0, maxwidth - 1); + // gap sizes + float gaps[BLOCK_STATS_CLUSTERS]; + STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; + // clusters + + testpt = ICOORD(textord_test_x, textord_test_y); + smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5); + // if (testing_on) + // tprintf("Row smooth factor=%d\n",smooth_factor); + prev_valid = false; + prev_x = -INT16_MAX; + const bool testing_row = false; + // min blob size + min_width = static_cast<int32_t>(block->pr_space); + total_count = 0; + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + blob = blob_it.data(); + if (!blob->joined_to_prev()) { + blob_box = blob->bounding_box(); + this_valid = blob_box.width() >= min_width; + if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) { + gap_stats.add(blob_box.left() - prev_x, 1); + } + total_count++; // count possibles + prev_x = blob_box.right(); + prev_valid = this_valid; + } + } + valid_count = gap_stats.get_total(); + if (valid_count < total_count * textord_words_minlarge) { + gap_stats.clear(); + prev_x = -INT16_MAX; + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + blob = blob_it.data(); + if (!blob->joined_to_prev()) { + blob_box = blob->bounding_box(); + if (blob_box.left() - prev_x < maxwidth) { + gap_stats.add(blob_box.left() - prev_x, 1); + } + prev_x = blob_box.right(); + } + } + } + if (gap_stats.get_total() == 0) { + row->min_space = 0; // no evidence + row->max_nonspace = 0; + return 0; + } + + cluster_count = 0; + lower = block->xheight * words_initial_lower; + upper = block->xheight * words_initial_upper; + gap_stats.smooth(smooth_factor); + do { + prev_count = cluster_count; + cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, + BLOCK_STATS_CLUSTERS, cluster_stats); + } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); + if (cluster_count < 1) { + row->min_space = 0; + row->max_nonspace = 0; + return 0; + } + for (gap_index = 0; gap_index < cluster_count; gap_index++) { + gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5); + } + // get medians + if (testing_on) { + tprintf("cluster_count=%d:", cluster_count); + for (gap_index = 0; gap_index < cluster_count; gap_index++) { + tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total()); + } + tprintf("\n"); + } + + // Try to find proportional non-space and space for row. + for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace; + gap_index++) { + ; + } + if (gap_index < cluster_count) { + lower = gaps[gap_index]; // most frequent below + } else { + if (testing_on) { + tprintf("No cluster below block threshold!, using default=%g\n", block->pr_nonsp); + } + lower = block->pr_nonsp; + } + for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace; + gap_index++) { + ; + } + if (gap_index < cluster_count) { + upper = gaps[gap_index]; // most frequent above + } else { + if (testing_on) { + tprintf("No cluster above block threshold!, using default=%g\n", block->pr_space); + } + upper = block->pr_space; + } + row->min_space = + static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread)); + row->max_nonspace = + static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread)); + row->space_threshold = (row->max_nonspace + row->min_space) / 2; + row->space_size = upper; + row->kern_size = lower; + if (testing_on) { + if (testing_row) { + tprintf("GAP STATS\n"); + gap_stats.print(); + tprintf("SPACE stats\n"); + cluster_stats[2].print_summary(); + tprintf("NONSPACE stats\n"); + cluster_stats[1].print_summary(); + } + tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space, + upper, row->max_nonspace, lower); + } + return 1; +} + +/** + * @name make_real_words + * + * Convert a TO_BLOCK to a BLOCK. + */ + +void make_real_words(tesseract::Textord *textord, + TO_BLOCK *block, // block to do + FCOORD rotation // for drawing +) { + TO_ROW *row; // current row + TO_ROW_IT row_it = block->get_rows(); + ROW *real_row = nullptr; // output row + ROW_IT real_row_it = block->block->row_list(); + + if (row_it.empty()) { + return; // empty block + } + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + if (row->blob_list()->empty() && !row->rep_words.empty()) { + real_row = make_rep_words(row, block); + } else if (!row->blob_list()->empty()) { + // In a fixed pitch document, some lines may be detected as fixed pitch + // while others don't, and will go through different path. + // For non-space delimited language like CJK, fixed pitch chop always + // leave the entire line as one word. We can force consistent chopping + // with force_make_prop_words flag. + POLY_BLOCK *pb = block->block->pdblk.poly_block(); + if (textord_chopper_test) { + real_row = textord->make_blob_words(row, rotation); + } else if (textord_force_make_prop_words || (pb != nullptr && !pb->IsText()) || + row->pitch_decision == PITCH_DEF_PROP || row->pitch_decision == PITCH_CORR_PROP) { + real_row = textord->make_prop_words(row, rotation); + } else if (row->pitch_decision == PITCH_DEF_FIXED || + row->pitch_decision == PITCH_CORR_FIXED) { + real_row = fixed_pitch_words(row, rotation); + } else { + ASSERT_HOST(false); + } + } + if (real_row != nullptr) { + // put row in block + real_row_it.add_after_then_move(real_row); + } + } + block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size), + static_cast<int16_t>(block->space_size), + static_cast<int16_t>(block->fixed_pitch)); + block->block->check_pitch(); +} + +/** + * @name make_rep_words + * + * Fabricate a real row from only the repeated blob words. + * Get the xheight from the block as it may be more meaningful. + */ + +ROW *make_rep_words( // make a row + TO_ROW *row, // row to convert + TO_BLOCK *block // block it lives in +) { + ROW *real_row; // output row + TBOX word_box; // bounding box + // iterator + WERD_IT word_it = &row->rep_words; + + if (word_it.empty()) { + return nullptr; + } + word_box = word_it.data()->bounding_box(); + for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { + word_box += word_it.data()->bounding_box(); + } + row->xheight = block->xheight; + real_row = + new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size)); + word_it.set_to_list(real_row->word_list()); + // put words in row + word_it.add_list_after(&row->rep_words); + real_row->recalc_bounding_box(); + return real_row; +} + +/** + * @name make_real_word + * + * Construct a WERD from a given number of adjacent entries in a + * list of BLOBNBOXs. + */ + +WERD *make_real_word(BLOBNBOX_IT *box_it, // iterator + int32_t blobcount, // no of blobs to use + bool bol, // start of line + uint8_t blanks // no of blanks +) { + C_OUTLINE_IT cout_it; + C_BLOB_LIST cblobs; + C_BLOB_IT cblob_it = &cblobs; + + for (int blobindex = 0; blobindex < blobcount; blobindex++) { + auto bblob = box_it->extract(); + if (bblob->joined_to_prev()) { + auto cblob = bblob->remove_cblob(); + if (cblob != nullptr) { + cout_it.set_to_list(cblob_it.data()->out_list()); + cout_it.move_to_last(); + cout_it.add_list_after(cblob->out_list()); + delete cblob; + } + } else { + auto cblob = bblob->remove_cblob(); + if (cblob != nullptr) { + cblob_it.add_after_then_move(cblob); + } + } + delete bblob; + box_it->forward(); // next one + } + + if (blanks < 1) { + blanks = 1; + } + + auto word = new WERD(&cblobs, blanks, nullptr); + + if (bol) { + word->set_flag(W_BOL, true); + } + if (box_it->at_first()) { + word->set_flag(W_EOL, true); // at end of line + } + + return word; +} + +} // namespace tesseract
