Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/textord.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/textord.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,320 @@ +/////////////////////////////////////////////////////////////////////// +// File: textord.h +// Description: The Textord class definition gathers text line and word +// finding functionality. +// Author: Ray Smith +// Created: Fri Mar 13 14:29:01 PDT 2009 +// +// (C) Copyright 2009, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_TEXTORD_TEXTORD_H_ +#define TESSERACT_TEXTORD_TEXTORD_H_ + +#include "bbgrid.h" +#include "blobbox.h" +#include "ccstruct.h" +#include "gap_map.h" + +#include <tesseract/publictypes.h> // For PageSegMode. + +namespace tesseract { + +class FCOORD; +class BLOCK_LIST; +class PAGE_RES; +class TO_BLOCK; +class TO_BLOCK_LIST; +class ScrollView; + +// A simple class that can be used by BBGrid to hold a word and an expanded +// bounding box that makes it easy to find words to put diacritics. +class WordWithBox { +public: + WordWithBox() : word_(nullptr) {} + explicit WordWithBox(WERD *word) : word_(word), bounding_box_(word->bounding_box()) { + int height = bounding_box_.height(); + bounding_box_.pad(height, height); + } + + const TBOX &bounding_box() const { + return bounding_box_; + } + // Returns the bounding box of only the good blobs. + TBOX true_bounding_box() const { + return word_->true_bounding_box(); + } + C_BLOB_LIST *RejBlobs() const { + return word_->rej_cblob_list(); + } + const WERD *word() const { + return word_; + } + +private: + // Borrowed pointer to a real word somewhere that must outlive this class. + WERD *word_; + // Cached expanded bounding box of the word, padded all round by its height. + TBOX bounding_box_; +}; + +// Make it usable by BBGrid. +CLISTIZEH(WordWithBox) +using WordGrid = BBGrid<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT>; +using WordSearch = GridSearch<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT>; + +class Textord { +public: + explicit Textord(CCStruct *ccstruct); + ~Textord() = default; + + // Make the textlines and words inside each block. + // binary_pix is mandatory and is the binarized input after line removal. + // grey_pix is optional, but if present must match the binary_pix in size, + // and must be a *real* grey image instead of binary_pix * 255. + // thresholds_pix is expected to be present iff grey_pix is present and + // can be an integer factor reduction of the grey_pix. It represents the + // thresholds that were used to create the binary_pix from the grey_pix. + // diacritic_blobs contain small confusing components that should be added + // to the appropriate word(s) in case they are really diacritics. + void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, + Image binary_pix, Image thresholds_pix, Image grey_pix, bool use_box_bottoms, + BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, + float *gradient); + + // If we were supposed to return only a single textline, and there is more + // than one, clean up and leave only the best. + void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res); + + bool use_cjk_fp_model() const { + return use_cjk_fp_model_; + } + void set_use_cjk_fp_model(bool flag) { + use_cjk_fp_model_ = flag; + } + + // tospace.cpp /////////////////////////////////////////// + void to_spacing(ICOORD page_tr, // topright of page + TO_BLOCK_LIST *blocks // blocks on page + ); + ROW *make_prop_words(TO_ROW *row, // row to make + FCOORD rotation // for drawing + ); + ROW *make_blob_words(TO_ROW *row, // row to make + FCOORD rotation // for drawing + ); + // tordmain.cpp /////////////////////////////////////////// + void find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks); + void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, bool testing_on); + +private: + // For underlying memory management and other utilities. + CCStruct *ccstruct_; + + // The size of the input image. + ICOORD page_tr_; + + bool use_cjk_fp_model_; + + // makerow.cpp /////////////////////////////////////////// + // Make the textlines inside each block. + void MakeRows(PageSegMode pageseg_mode, const FCOORD &skew, int width, int height, + TO_BLOCK_LIST *to_blocks); + // Make the textlines inside a single block. + void MakeBlockRows(int min_spacing, int max_spacing, const FCOORD &skew, TO_BLOCK *block, + ScrollView *win); + +public: + void compute_block_xheight(TO_BLOCK *block, float gradient); + void compute_row_xheight(TO_ROW *row, // row to do + const FCOORD &rotation, + float gradient, // global skew + int block_line_size); + void make_spline_rows(TO_BLOCK *block, // block to do + float gradient, // gradient to fit + bool testing_on); + +private: + //// oldbasel.cpp //////////////////////////////////////// + void make_old_baselines(TO_BLOCK *block, // block to do + bool testing_on, // correct orientation + float gradient); + void correlate_lines(TO_BLOCK *block, float gradient); + void correlate_neighbours(TO_BLOCK *block, // block rows are in. + TO_ROW **rows, // rows of block. + int rowcount); // no of rows to do. + int correlate_with_stats(TO_ROW **rows, // rows of block. + int rowcount, // no of rows to do. + TO_BLOCK *block); + void find_textlines(TO_BLOCK *block, // block row is in + TO_ROW *row, // row to do + int degree, // required approximation + QSPLINE *spline); // starting spline + // tospace.cpp /////////////////////////////////////////// + // DEBUG USE ONLY + void block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional, + // resulting estimate + int16_t &block_space_gap_width, + // resulting estimate + int16_t &block_non_space_gap_width); + void row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx, + // estimate for block + int16_t block_space_gap_width, + // estimate for block + int16_t block_non_space_gap_width); + void old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats, + STATS *small_gap_stats, int16_t block_space_gap_width, + // estimate for block + int16_t block_non_space_gap_width); + bool isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats, bool suspected_table, + int16_t block_idx, int16_t row_idx); + int16_t stats_count_under(STATS *stats, int16_t threshold); + void improve_row_threshold(TO_ROW *row, STATS *all_gap_stats); + bool make_a_word_break(TO_ROW *row, // row being made + TBOX blob_box, // for next_blob // how many blanks? + int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap, + int16_t within_xht_current_gap, TBOX next_blob_box, int16_t next_gap, + uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non, + bool &prev_gap_was_a_space, bool &break_at_next_gap); + bool narrow_blob(TO_ROW *row, TBOX blob_box); + bool wide_blob(TO_ROW *row, TBOX blob_box); + bool suspected_punct_blob(TO_ROW *row, TBOX box); + void peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box, int16_t &next_gap, + int16_t &next_within_xht_gap); + void mark_gap(TBOX blob, // blob following gap + int16_t rule, // heuristic id + int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap, + int16_t next_blob_width, int16_t next_gap); + float find_mean_blob_spacing(WERD *word); + bool ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left, int16_t right); + // get bounding box + TBOX reduced_box_next(TO_ROW *row, // current row + BLOBNBOX_IT *it // iterator to blobds + ); + TBOX reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht); + // tordmain.cpp /////////////////////////////////////////// + float filter_noise_blobs(BLOBNBOX_LIST *src_list, BLOBNBOX_LIST *noise_list, + BLOBNBOX_LIST *small_list, BLOBNBOX_LIST *large_list); + // Fixes the block so it obeys all the rules: + // Must have at least one ROW. + // Must have at least one WERD. + // WERDs contain a fake blob. + void cleanup_nontext_block(BLOCK *block); + void cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks); + bool clean_noise_from_row(ROW *row); + void clean_noise_from_words(ROW *row); + // Remove outlines that are a tiny fraction in either width or height + // of the word height. + void clean_small_noise_from_words(ROW *row); + // Groups blocks by rotation, then, for each group, makes a WordGrid and calls + // TransferDiacriticsToWords to copy the diacritic blobs to the most + // appropriate words in the group of blocks. Source blobs are not touched. + void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks); + // Places a copy of blobs that are near a word (after applying rotation to the + // blob) in the most appropriate word, unless there is doubt, in which case a + // blob can end up in two words. Source blobs are not touched. + void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, const FCOORD &rotation, + WordGrid *word_grid); + +public: + // makerow.cpp /////////////////////////////////////////// + BOOL_VAR_H(textord_single_height_mode); + // tospace.cpp /////////////////////////////////////////// + BOOL_VAR_H(tosp_old_to_method); + BOOL_VAR_H(tosp_old_to_constrain_sp_kn); + BOOL_VAR_H(tosp_only_use_prop_rows); + BOOL_VAR_H(tosp_force_wordbreak_on_punct); + BOOL_VAR_H(tosp_use_pre_chopping); + BOOL_VAR_H(tosp_old_to_bug_fix); + BOOL_VAR_H(tosp_block_use_cert_spaces); + BOOL_VAR_H(tosp_row_use_cert_spaces); + BOOL_VAR_H(tosp_narrow_blobs_not_cert); + BOOL_VAR_H(tosp_row_use_cert_spaces1); + BOOL_VAR_H(tosp_recovery_isolated_row_stats); + BOOL_VAR_H(tosp_only_small_gaps_for_kern); + BOOL_VAR_H(tosp_all_flips_fuzzy); + BOOL_VAR_H(tosp_fuzzy_limit_all); + BOOL_VAR_H(tosp_stats_use_xht_gaps); + BOOL_VAR_H(tosp_use_xht_gaps); + BOOL_VAR_H(tosp_only_use_xht_gaps); + BOOL_VAR_H(tosp_rule_9_test_punct); + BOOL_VAR_H(tosp_flip_fuzz_kn_to_sp); + BOOL_VAR_H(tosp_flip_fuzz_sp_to_kn); + BOOL_VAR_H(tosp_improve_thresh); + INT_VAR_H(tosp_debug_level); + INT_VAR_H(tosp_enough_space_samples_for_median); + INT_VAR_H(tosp_redo_kern_limit); + INT_VAR_H(tosp_few_samples); + INT_VAR_H(tosp_short_row); + INT_VAR_H(tosp_sanity_method); + double_VAR_H(tosp_old_sp_kn_th_factor); + double_VAR_H(tosp_threshold_bias1); + double_VAR_H(tosp_threshold_bias2); + double_VAR_H(tosp_narrow_fraction); + double_VAR_H(tosp_narrow_aspect_ratio); + double_VAR_H(tosp_wide_fraction); + double_VAR_H(tosp_wide_aspect_ratio); + double_VAR_H(tosp_fuzzy_space_factor); + double_VAR_H(tosp_fuzzy_space_factor1); + double_VAR_H(tosp_fuzzy_space_factor2); + double_VAR_H(tosp_gap_factor); + double_VAR_H(tosp_kern_gap_factor1); + double_VAR_H(tosp_kern_gap_factor2); + double_VAR_H(tosp_kern_gap_factor3); + double_VAR_H(tosp_ignore_big_gaps); + double_VAR_H(tosp_ignore_very_big_gaps); + double_VAR_H(tosp_rep_space); + double_VAR_H(tosp_enough_small_gaps); + double_VAR_H(tosp_table_kn_sp_ratio); + double_VAR_H(tosp_table_xht_sp_ratio); + double_VAR_H(tosp_table_fuzzy_kn_sp_ratio); + double_VAR_H(tosp_fuzzy_kn_fraction); + double_VAR_H(tosp_fuzzy_sp_fraction); + double_VAR_H(tosp_min_sane_kn_sp); + double_VAR_H(tosp_init_guess_kn_mult); + double_VAR_H(tosp_init_guess_xht_mult); + double_VAR_H(tosp_max_sane_kn_thresh); + double_VAR_H(tosp_flip_caution); + double_VAR_H(tosp_large_kerning); + double_VAR_H(tosp_dont_fool_with_small_kerns); + double_VAR_H(tosp_near_lh_edge); + double_VAR_H(tosp_silly_kn_sp_gap); + double_VAR_H(tosp_pass_wide_fuzz_sp_to_context); + // tordmain.cpp /////////////////////////////////////////// + BOOL_VAR_H(textord_no_rejects); + BOOL_VAR_H(textord_show_blobs); + BOOL_VAR_H(textord_show_boxes); + INT_VAR_H(textord_max_noise_size); + INT_VAR_H(textord_baseline_debug); + double_VAR_H(textord_noise_area_ratio); + double_VAR_H(textord_initialx_ile); + double_VAR_H(textord_initialasc_ile); + INT_VAR_H(textord_noise_sizefraction); + double_VAR_H(textord_noise_sizelimit); + INT_VAR_H(textord_noise_translimit); + double_VAR_H(textord_noise_normratio); + BOOL_VAR_H(textord_noise_rejwords); + BOOL_VAR_H(textord_noise_rejrows); + double_VAR_H(textord_noise_syfract); + double_VAR_H(textord_noise_sxfract); + double_VAR_H(textord_noise_hfract); + INT_VAR_H(textord_noise_sncount); + double_VAR_H(textord_noise_rowratio); + BOOL_VAR_H(textord_noise_debug); + double_VAR_H(textord_blshift_maxshift); + double_VAR_H(textord_blshift_xfraction); +}; + +} // namespace tesseract + +#endif // TESSERACT_TEXTORD_TEXTORD_H_
