Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/textord.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/textord.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,295 @@ +/////////////////////////////////////////////////////////////////////// +// File: textord.cpp +// Description: The top-level text line and word finding functionality. +// Author: Ray Smith +// Created: Fri Mar 13 14:43:01 PDT 2009 +// +// (C) Copyright 2009, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +// Include automatically generated configuration file if running autoconf. 
+#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "baselinedetect.h" +#include "drawtord.h" +#include "makerow.h" +#include "pageres.h" +#include "textord.h" +#include "tordmain.h" +#include "wordseg.h" + +namespace tesseract { + +Textord::Textord(CCStruct *ccstruct) + : ccstruct_(ccstruct) + , use_cjk_fp_model_(false) + , + // makerow.cpp /////////////////////////////////////////// + BOOL_MEMBER(textord_single_height_mode, false, "Script has no xheight, so use a single mode", + ccstruct_->params()) + , + // tospace.cpp /////////////////////////////////////////// + BOOL_MEMBER(tosp_old_to_method, false, "Space stats use prechopping?", ccstruct_->params()) + , BOOL_MEMBER(tosp_old_to_constrain_sp_kn, false, + "Constrain relative values of inter and intra-word gaps for " + "old_to_method.", + ccstruct_->params()) + , BOOL_MEMBER(tosp_only_use_prop_rows, true, "Block stats to use fixed pitch rows?", + ccstruct_->params()) + , BOOL_MEMBER(tosp_force_wordbreak_on_punct, false, + "Force word breaks on punct to break long lines in non-space " + "delimited langs", + ccstruct_->params()) + , BOOL_MEMBER(tosp_use_pre_chopping, false, "Space stats use prechopping?", ccstruct_->params()) + , BOOL_MEMBER(tosp_old_to_bug_fix, false, "Fix suspected bug in old code", ccstruct_->params()) + , BOOL_MEMBER(tosp_block_use_cert_spaces, true, "Only stat OBVIOUS spaces", ccstruct_->params()) + , BOOL_MEMBER(tosp_row_use_cert_spaces, true, "Only stat OBVIOUS spaces", ccstruct_->params()) + , BOOL_MEMBER(tosp_narrow_blobs_not_cert, true, "Only stat OBVIOUS spaces", ccstruct_->params()) + , BOOL_MEMBER(tosp_row_use_cert_spaces1, true, "Only stat OBVIOUS spaces", ccstruct_->params()) + , BOOL_MEMBER(tosp_recovery_isolated_row_stats, true, + "Use row alone when inadequate cert spaces", ccstruct_->params()) + , BOOL_MEMBER(tosp_only_small_gaps_for_kern, false, "Better guess", ccstruct_->params()) + , BOOL_MEMBER(tosp_all_flips_fuzzy, false, "Pass ANY flip to context?", 
ccstruct_->params()) + , BOOL_MEMBER(tosp_fuzzy_limit_all, true, "Don't restrict kn->sp fuzzy limit to tables", + ccstruct_->params()) + , BOOL_MEMBER(tosp_stats_use_xht_gaps, true, "Use within xht gap for wd breaks", + ccstruct_->params()) + , BOOL_MEMBER(tosp_use_xht_gaps, true, "Use within xht gap for wd breaks", ccstruct_->params()) + , BOOL_MEMBER(tosp_only_use_xht_gaps, false, "Only use within xht gap for wd breaks", + ccstruct_->params()) + , BOOL_MEMBER(tosp_rule_9_test_punct, false, "Don't chng kn to space next to punct", + ccstruct_->params()) + , BOOL_MEMBER(tosp_flip_fuzz_kn_to_sp, true, "Default flip", ccstruct_->params()) + , BOOL_MEMBER(tosp_flip_fuzz_sp_to_kn, true, "Default flip", ccstruct_->params()) + , BOOL_MEMBER(tosp_improve_thresh, false, "Enable improvement heuristic", ccstruct_->params()) + , INT_MEMBER(tosp_debug_level, 0, "Debug data", ccstruct_->params()) + , INT_MEMBER(tosp_enough_space_samples_for_median, 3, "or should we use mean", + ccstruct_->params()) + , INT_MEMBER(tosp_redo_kern_limit, 10, "No.samples reqd to reestimate for row", + ccstruct_->params()) + , INT_MEMBER(tosp_few_samples, 40, "No.gaps reqd with 1 large gap to treat as a table", + ccstruct_->params()) + , INT_MEMBER(tosp_short_row, 20, "No.gaps reqd with few cert spaces to use certs", + ccstruct_->params()) + , INT_MEMBER(tosp_sanity_method, 1, "How to avoid being silly", ccstruct_->params()) + , double_MEMBER(tosp_old_sp_kn_th_factor, 2.0, + "Factor for defining space threshold in terms of space and " + "kern sizes", + ccstruct_->params()) + , double_MEMBER(tosp_threshold_bias1, 0, "how far between kern and space?", ccstruct_->params()) + , double_MEMBER(tosp_threshold_bias2, 0, "how far between kern and space?", ccstruct_->params()) + , double_MEMBER(tosp_narrow_fraction, 0.3, "Fract of xheight for narrow", ccstruct_->params()) + , double_MEMBER(tosp_narrow_aspect_ratio, 0.48, "narrow if w/h less than this", + ccstruct_->params()) + , 
double_MEMBER(tosp_wide_fraction, 0.52, "Fract of xheight for wide", ccstruct_->params()) + , double_MEMBER(tosp_wide_aspect_ratio, 0.0, "wide if w/h less than this", ccstruct_->params()) + , double_MEMBER(tosp_fuzzy_space_factor, 0.6, "Fract of xheight for fuzz sp", + ccstruct_->params()) + , double_MEMBER(tosp_fuzzy_space_factor1, 0.5, "Fract of xheight for fuzz sp", + ccstruct_->params()) + , double_MEMBER(tosp_fuzzy_space_factor2, 0.72, "Fract of xheight for fuzz sp", + ccstruct_->params()) + , double_MEMBER(tosp_gap_factor, 0.83, "gap ratio to flip sp->kern", ccstruct_->params()) + , double_MEMBER(tosp_kern_gap_factor1, 2.0, "gap ratio to flip kern->sp", ccstruct_->params()) + , double_MEMBER(tosp_kern_gap_factor2, 1.3, "gap ratio to flip kern->sp", ccstruct_->params()) + , double_MEMBER(tosp_kern_gap_factor3, 2.5, "gap ratio to flip kern->sp", ccstruct_->params()) + , double_MEMBER(tosp_ignore_big_gaps, -1, "xht multiplier", ccstruct_->params()) + , double_MEMBER(tosp_ignore_very_big_gaps, 3.5, "xht multiplier", ccstruct_->params()) + , double_MEMBER(tosp_rep_space, 1.6, "rep gap multiplier for space", ccstruct_->params()) + , double_MEMBER(tosp_enough_small_gaps, 0.65, "Fract of kerns reqd for isolated row stats", + ccstruct_->params()) + , double_MEMBER(tosp_table_kn_sp_ratio, 2.25, "Min difference of kn & sp in table", + ccstruct_->params()) + , double_MEMBER(tosp_table_xht_sp_ratio, 0.33, "Expect spaces bigger than this", + ccstruct_->params()) + , double_MEMBER(tosp_table_fuzzy_kn_sp_ratio, 3.0, "Fuzzy if less than this", + ccstruct_->params()) + , double_MEMBER(tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg", ccstruct_->params()) + , double_MEMBER(tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg", ccstruct_->params()) + , double_MEMBER(tosp_min_sane_kn_sp, 1.5, "Don't trust spaces less than this time kn", + ccstruct_->params()) + , double_MEMBER(tosp_init_guess_kn_mult, 2.2, "Thresh guess - mult kn by this", + ccstruct_->params()) + , 
double_MEMBER(tosp_init_guess_xht_mult, 0.28, "Thresh guess - mult xht by this", + ccstruct_->params()) + , double_MEMBER(tosp_max_sane_kn_thresh, 5.0, "Multiplier on kn to limit thresh", + ccstruct_->params()) + , double_MEMBER(tosp_flip_caution, 0.0, "Don't autoflip kn to sp when large separation", + ccstruct_->params()) + , double_MEMBER(tosp_large_kerning, 0.19, "Limit use of xht gap with large kns", + ccstruct_->params()) + , double_MEMBER(tosp_dont_fool_with_small_kerns, -1, "Limit use of xht gap with odd small kns", + ccstruct_->params()) + , double_MEMBER(tosp_near_lh_edge, 0, "Don't reduce box if the top left is non blank", + ccstruct_->params()) + , double_MEMBER(tosp_silly_kn_sp_gap, 0.2, "Don't let sp minus kn get too small", + ccstruct_->params()) + , double_MEMBER(tosp_pass_wide_fuzz_sp_to_context, 0.75, "How wide fuzzies need context", + ccstruct_->params()) + , + // tordmain.cpp /////////////////////////////////////////// + BOOL_MEMBER(textord_no_rejects, false, "Don't remove noise blobs", ccstruct_->params()) + , BOOL_MEMBER(textord_show_blobs, false, "Display unsorted blobs", ccstruct_->params()) + , BOOL_MEMBER(textord_show_boxes, false, "Display unsorted blobs", ccstruct_->params()) + , INT_MEMBER(textord_max_noise_size, 7, "Pixel size of noise", ccstruct_->params()) + , INT_MEMBER(textord_baseline_debug, 0, "Baseline debug level", ccstruct_->params()) + , double_MEMBER(textord_noise_area_ratio, 0.7, "Fraction of bounding box for noise", + ccstruct_->params()) + , double_MEMBER(textord_initialx_ile, 0.75, "Ile of sizes for xheight guess", + ccstruct_->params()) + , double_MEMBER(textord_initialasc_ile, 0.90, "Ile of sizes for xheight guess", + ccstruct_->params()) + , INT_MEMBER(textord_noise_sizefraction, 10, "Fraction of size for maxima", ccstruct_->params()) + , double_MEMBER(textord_noise_sizelimit, 0.5, "Fraction of x for big t count", + ccstruct_->params()) + , INT_MEMBER(textord_noise_translimit, 16, "Transitions for normal blob", 
ccstruct_->params()) + , double_MEMBER(textord_noise_normratio, 2.0, "Dot to norm ratio for deletion", + ccstruct_->params()) + , BOOL_MEMBER(textord_noise_rejwords, true, "Reject noise-like words", ccstruct_->params()) + , BOOL_MEMBER(textord_noise_rejrows, true, "Reject noise-like rows", ccstruct_->params()) + , double_MEMBER(textord_noise_syfract, 0.2, "xh fract height error for norm blobs", + ccstruct_->params()) + , double_MEMBER(textord_noise_sxfract, 0.4, "xh fract width error for norm blobs", + ccstruct_->params()) + , double_MEMBER(textord_noise_hfract, 1.0 / 64, + "Height fraction to discard outlines as speckle noise", ccstruct_->params()) + , INT_MEMBER(textord_noise_sncount, 1, "super norm blobs to save row", ccstruct_->params()) + , double_MEMBER(textord_noise_rowratio, 6.0, "Dot to norm ratio for deletion", + ccstruct_->params()) + , BOOL_MEMBER(textord_noise_debug, false, "Debug row garbage detector", ccstruct_->params()) + , double_MEMBER(textord_blshift_maxshift, 0.00, "Max baseline shift", ccstruct_->params()) + , double_MEMBER(textord_blshift_xfraction, 9.99, "Min size of baseline shift", + ccstruct_->params()) {} + +// Make the textlines and words inside each block. +void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, + Image binary_pix, Image thresholds_pix, Image grey_pix, bool use_box_bottoms, + BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, + TO_BLOCK_LIST *to_blocks, float *gradient) { + page_tr_.set_x(width); + page_tr_.set_y(height); + if (to_blocks->empty()) { + // AutoPageSeg was not used, so we need to find_components first. + find_components(binary_pix, blocks, to_blocks); + TO_BLOCK_IT it(to_blocks); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + TO_BLOCK *to_block = it.data(); + // Compute the edge offsets whether or not there is a grey_pix. + // We have by-passed auto page seg, so we have to run it here. 
+ // By page segmentation mode there is no non-text to avoid running on. + to_block->ComputeEdgeOffsets(thresholds_pix, grey_pix); + } + } else if (!PSM_SPARSE(pageseg_mode)) { + // AutoPageSeg does not need to find_components as it did that already. + // Filter_blobs sets up the TO_BLOCKs the same as find_components does. + filter_blobs(page_tr_, to_blocks, true); + } + + ASSERT_HOST(!to_blocks->empty()); + if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) { + const FCOORD anticlockwise90(0.0f, 1.0f); + const FCOORD clockwise90(0.0f, -1.0f); + TO_BLOCK_IT it(to_blocks); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + TO_BLOCK *to_block = it.data(); + BLOCK *block = to_block->block; + // Create a fake poly_block in block from its bounding box. + block->pdblk.set_poly_block(new POLY_BLOCK(block->pdblk.bounding_box(), PT_VERTICAL_TEXT)); + // Rotate the to_block along with its contained block and blobnbox lists. + to_block->rotate(anticlockwise90); + // Set the block's rotation values to obey the convention followed in + // layout analysis for vertical text. + block->set_re_rotation(clockwise90); + block->set_classify_rotation(clockwise90); + } + } + + TO_BLOCK_IT to_block_it(to_blocks); + TO_BLOCK *to_block = to_block_it.data(); + // Make the rows in the block. + // Do it the old fashioned way. + if (PSM_LINE_FIND_ENABLED(pageseg_mode)) { + *gradient = make_rows(page_tr_, to_blocks); + } else if (!PSM_SPARSE(pageseg_mode)) { + // RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row. + *gradient = make_single_row(page_tr_, pageseg_mode != PSM_RAW_LINE, to_block, to_blocks); + } else { + *gradient = 0.0f; + } + BaselineDetect baseline_detector(textord_baseline_debug, reskew, to_blocks); + baseline_detector.ComputeStraightBaselines(use_box_bottoms); + baseline_detector.ComputeBaselineSplinesAndXheights( + page_tr_, pageseg_mode != PSM_RAW_LINE, textord_heavy_nr, textord_show_final_rows, this); + // Now make the words in the lines. 
+ if (PSM_WORD_FIND_ENABLED(pageseg_mode)) { + // SINGLE_LINE uses the old word maker on the single line. + make_words(this, page_tr_, *gradient, blocks, to_blocks); + } else { + // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a + // single word, and in SINGLE_CHAR mode, all the outlines + // go in a single blob. + TO_BLOCK *to_block = to_block_it.data(); + make_single_word(pageseg_mode == PSM_SINGLE_CHAR, to_block->get_rows(), + to_block->block->row_list()); + } + // Remove empties. + cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks); + TransferDiacriticsToBlockGroups(diacritic_blobs, blocks); + // Compute the margins for each row in the block, to be used later for + // paragraph detection. + BLOCK_IT b_it(blocks); + for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { + b_it.data()->compute_row_margins(); + } +#ifndef GRAPHICS_DISABLED + close_to_win(); +#endif +} + +// If we were supposed to return only a single textline, and there is more +// than one, clean up and leave only the best. +void Textord::CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res) { + if (PSM_LINE_FIND_ENABLED(pageseg_mode) || PSM_SPARSE(pageseg_mode)) { + return; // No cleanup required. + } + PAGE_RES_IT it(page_res); + // Find the best row, being the greatest mean word conf. + float row_total_conf = 0.0f; + int row_word_count = 0; + ROW_RES *best_row = nullptr; + float best_conf = 0.0f; + for (it.restart_page(); it.word() != nullptr; it.forward()) { + WERD_RES *word = it.word(); + row_total_conf += word->best_choice->certainty(); + ++row_word_count; + if (it.next_row() != it.row()) { + row_total_conf /= row_word_count; + if (best_row == nullptr || best_conf < row_total_conf) { + best_row = it.row(); + best_conf = row_total_conf; + } + row_total_conf = 0.0f; + row_word_count = 0; + } + } + // Now eliminate any word not in the best row. 
+ for (it.restart_page(); it.word() != nullptr; it.forward()) { + if (it.row() != best_row) { + it.DeleteCurrentWord(); + } + } +} + +} // namespace tesseract.
