Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/wordrec/wordrec.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/wordrec/wordrec.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,452 @@ +/////////////////////////////////////////////////////////////////////// +// File: wordrec.h +// Description: wordrec class. +// Author: Samuel Charron +// +// (C) Copyright 2006, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_WORDREC_WORDREC_H_ +#define TESSERACT_WORDREC_WORDREC_H_ + +#ifdef HAVE_CONFIG_H +# include "config_auto.h" // DISABLED_LEGACY_ENGINE +#endif + +#ifdef DISABLED_LEGACY_ENGINE + +# include <cstdint> // for int16_t, int32_t +# include "classify.h" // for Classify +# include "params.h" // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP... +# include "ratngs.h" // for WERD_CHOICE + +namespace tesseract { +class TessdataManager; +} + +namespace tesseract { + +/* ccmain/tstruct.cpp */ + +class TESS_API Wordrec : public Classify { +public: + // config parameters + + BOOL_VAR_H(wordrec_debug_blamer); + BOOL_VAR_H(wordrec_run_blamer); + + // methods + Wordrec(); + virtual ~Wordrec() = default; + + // tface.cpp + void program_editup(const std::string &textbase, TessdataManager *init_classifier, + TessdataManager *init_dict); + void program_editdown(int32_t elapsed_time); + int end_recog(); + int dict_word(const WERD_CHOICE &word); + + // Member variables + WERD_CHOICE *prev_word_best_choice_; +}; + +} // namespace tesseract + +#else // DISABLED_LEGACY_ENGINE not defined + +# include <memory> +# include "associate.h" +# include "chop.h" // for PointHeap, MAX_NUM_POINTS +# include "classify.h" // for Classify +# include "dict.h" +# include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK +# include "findseam.h" // for SeamQueue, SeamPile +# include "language_model.h" +# include "matrix.h" +# include "oldlist.h" // for LIST +# include "params.h" // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP... +# include "points.h" // for ICOORD +# include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only), BLOB_CHOI... +# include "seam.h" // for SEAM (ptr only), PRIORITY +# include "stopper.h" // for DANGERR + +# include <cstdint> // for int16_t, int32_t + +namespace tesseract { + +class EDGEPT_CLIST; +class MATRIX; +class TBOX; +class UNICHARSET; +class WERD_RES; + +class LMPainPoints; +class TessdataManager; +struct BestChoiceBundle; + +struct BlamerBundle; +struct EDGEPT; +struct MATRIX_COORD; +struct SPLIT; +struct TBLOB; +struct TESSLINE; +struct TWERD; + +// A class for storing which nodes are to be processed by the segmentation +// search. There is a single SegSearchPending for each column in the ratings +// matrix, and it indicates whether the segsearch should combine all +// BLOB_CHOICES in the column, or just the given row with the parents +// corresponding to *this SegSearchPending, and whether only updated parent +// ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs. +class SegSearchPending { +public: + SegSearchPending() + : classified_row_(-1), revisit_whole_column_(false), column_classified_(false) {} + + // Marks the whole column as just classified. Used to start a search on + // a newly initialized ratings matrix. + void SetColumnClassified() { + column_classified_ = true; + } + // Marks the matrix entry at the given row as just classified. + // Used after classifying a new matrix cell. + // Additional to, not overriding a previous RevisitWholeColumn. + void SetBlobClassified(int row) { + classified_row_ = row; + } + // Marks the whole column as needing work, but not just classified. + // Used when the parent vse list is updated. + // Additional to, not overriding a previous SetBlobClassified. + void RevisitWholeColumn() { + revisit_whole_column_ = true; + } + + // Clears *this to indicate no work to do. + void Clear() { + classified_row_ = -1; + revisit_whole_column_ = false; + column_classified_ = false; + } + + // Returns true if there are updates to do in the column that *this + // represents. + bool WorkToDo() const { + return revisit_whole_column_ || column_classified_ || classified_row_ >= 0; + } + // Returns true if the given row was just classified. + bool IsRowJustClassified(int row) const { + return row == classified_row_ || column_classified_; + } + // Returns the single row to process if there is only one, otherwise -1. + int SingleRow() const { + return revisit_whole_column_ || column_classified_ ? -1 : classified_row_; + } + +private: + // If non-negative, indicates the single row in the ratings matrix that has + // just been classified, and so should be combined with all the parents in the + // column that this SegSearchPending represents. + // Operates independently of revisit_whole_column. + int classified_row_; + // If revisit_whole_column is true, then all BLOB_CHOICEs in this column will + // be processed, but classified_row can indicate a row that is newly + // classified. Overridden if column_classified is true. + bool revisit_whole_column_; + // If column_classified is true, parent vses are processed with all rows + // regardless of whether they are just updated, overriding + // revisit_whole_column and classified_row. + bool column_classified_; +}; + +/* ccmain/tstruct.cpp *********************************************************/ +class FRAGMENT : public ELIST_LINK { +public: + FRAGMENT() { // constructor + } + FRAGMENT(EDGEPT *head_pt, // start + EDGEPT *tail_pt); // end + + ICOORD head; // coords of start + ICOORD tail; // coords of end + EDGEPT *headpt; // start point + EDGEPT *tailpt; // end point +}; +ELISTIZEH(FRAGMENT) + +class TESS_API Wordrec : public Classify { +public: + // config parameters ******************************************************* + BOOL_VAR_H(merge_fragments_in_matrix); + BOOL_VAR_H(wordrec_enable_assoc); + BOOL_VAR_H(force_word_assoc); + INT_VAR_H(repair_unchopped_blobs); + double_VAR_H(tessedit_certainty_threshold); + INT_VAR_H(chop_debug); + BOOL_VAR_H(chop_enable); + BOOL_VAR_H(chop_vertical_creep); + INT_VAR_H(chop_split_length); + INT_VAR_H(chop_same_distance); + INT_VAR_H(chop_min_outline_points); + INT_VAR_H(chop_seam_pile_size); + BOOL_VAR_H(chop_new_seam_pile); + INT_VAR_H(chop_inside_angle); + INT_VAR_H(chop_min_outline_area); + double_VAR_H(chop_split_dist_knob); + double_VAR_H(chop_overlap_knob); + double_VAR_H(chop_center_knob); + INT_VAR_H(chop_centered_maxwidth); + double_VAR_H(chop_sharpness_knob); + double_VAR_H(chop_width_change_knob); + double_VAR_H(chop_ok_split); + double_VAR_H(chop_good_split); + INT_VAR_H(chop_x_y_weight); + BOOL_VAR_H(assume_fixed_pitch_char_segment); + INT_VAR_H(wordrec_debug_level); + INT_VAR_H(wordrec_max_join_chunks); + BOOL_VAR_H(wordrec_skip_no_truth_words); + BOOL_VAR_H(wordrec_debug_blamer); + BOOL_VAR_H(wordrec_run_blamer); + INT_VAR_H(segsearch_debug_level); + INT_VAR_H(segsearch_max_pain_points); + INT_VAR_H(segsearch_max_futile_classifications); + double_VAR_H(segsearch_max_char_wh_ratio); + BOOL_VAR_H(save_alt_choices); + + // methods from wordrec/*.cpp *********************************************** + Wordrec(); + ~Wordrec() override = default; + + // Fills word->alt_choices with alternative paths found during + // chopping/segmentation search that are kept in best_choices. + void SaveAltChoices(const LIST &best_choices, WERD_RES *word); + + // Fills character choice lattice in the given BlamerBundle + // using the given ratings matrix and best choice list. + void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, + const UNICHARSET &unicharset, BlamerBundle *blamer_bundle); + + // Calls fill_lattice_ member function + // (assumes that fill_lattice_ is not nullptr). + void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, + const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) { + (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle); + } + + // tface.cpp + void program_editup(const std::string &textbase, TessdataManager *init_classifier, + TessdataManager *init_dict); + void cc_recog(WERD_RES *word); + void program_editdown(int32_t elapsed_time); + void set_pass1(); + void set_pass2(); + int end_recog(); + BLOB_CHOICE_LIST *call_matcher(TBLOB *blob); + int dict_word(const WERD_CHOICE &word); + // wordclass.cpp + BLOB_CHOICE_LIST *classify_blob(TBLOB *blob, const char *string, ScrollView::Color color, + BlamerBundle *blamer_bundle); + + // segsearch.cpp + // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs. + // Each entry in the matrix represents the classification choice + // for a chunk, i.e. an entry in row 2, column 1 represents the list + // of ratings for the chunks 1 and 2 classified as a single blob. + // The entries on the diagonal of the matrix are classifier choice lists + // for a single chunk from the maximal segmentation. + // + // The ratings matrix given to SegSearch represents the segmentation + // graph / trellis for the current word. The nodes in the graph are the + // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings + // matrix. The children of each node (nodes connected by outgoing links) + // are the entries in the column that is equal to node's row+1. The parents + // (nodes connected by the incoming links) are the entries in the row that + // is equal to the node's column-1. Here is an example ratings matrix: + // + // 0 1 2 3 4 + // ------------------------- + // 0| c,( | + // 1| d l,1 | + // 2| o | + // 3| c,( | + // 4| g,y l,1 | + // ------------------------- + // + // In the example above node "o" has children (outgoing connection to nodes) + // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d". + // + // The objective of the search is to find the least cost path, where the cost + // is determined by the language model components and the properties of the + // cut between the blobs on the path. SegSearch starts by populating the + // matrix with the all the entries that were classified by the chopper and + // finding the initial best path. Based on the classifier ratings, language + // model scores and the properties of each cut, a list of "pain points" is + // constructed - those are the points on the path where the choices do not + // look consistent with the neighboring choices, the cuts look particularly + // problematic, or the certainties of the blobs are low. The most troublesome + // "pain point" is picked from the list and the new entry in the ratings + // matrix corresponding to this "pain point" is filled in. Then the language + // model state is updated to reflect the new classification and the new + // "pain points" are added to the list and the next most troublesome + // "pain point" is determined. This continues until either the word choice + // composed from the best paths in the segmentation graph is "good enough" + // (e.g. above a certain certainty threshold, is an unambiguous dictionary + // word, etc) or there are no more "pain points" to explore. + // + // If associate_blobs is set to false no new classifications will be done + // to combine blobs. Segmentation search will run only one "iteration" + // on the classifications already recorded in chunks_record.ratings. + // + // Note: this function assumes that word_res, best_choice_bundle arguments + // are not nullptr. + void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, + BlamerBundle *blamer_bundle); + + // Setup and run just the initial segsearch on an established matrix, + // without doing any additional chopping or joining. + // (Internal factored version that can be used as part of the main SegSearch.) + void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, + std::vector<SegSearchPending> *pending, + BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle); + + // chop.cpp + PRIORITY point_priority(EDGEPT *point); + void add_point_to_list(PointHeap *point_heap, EDGEPT *point); + // Returns true if the edgept supplied as input is an inside angle. This + // is determined by the angular change of the vectors from point to point. + bool is_inside_angle(EDGEPT *pt); + int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3); + EDGEPT *pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist); + void prioritize_points(TESSLINE *outline, PointHeap *points); + void new_min_point(EDGEPT *local_min, PointHeap *points); + void new_max_point(EDGEPT *local_max, PointHeap *points); + void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, + EDGEPT_CLIST *new_points); + + // chopper.cpp + SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, + const std::vector<SEAM *> &seams); + SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob, + const std::vector<SEAM *> &seams); + SEAM *chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES *word_res, + unsigned *blob_number); + SEAM *improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, + bool split_next_to_fragment, bool italic_blob, WERD_RES *word, + unsigned *blob_number); + SEAM *chop_one_blob(const std::vector<TBOX> &boxes, + const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, + unsigned *blob_number); + void chop_word_main(WERD_RES *word); + void improve_by_chopping(float rating_cert_scale, WERD_RES *word, + BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, + LMPainPoints *pain_points, std::vector<SegSearchPending> *pending); + int select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices, float rating_ceiling, + bool split_next_to_fragment); + int select_blob_to_split_from_fixpt(DANGERR *fixpt); + + // findseam.cpp + void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams); + void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, + SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile); + void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue); + SEAM *pick_good_seam(TBLOB *blob); + void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, + SeamPile *seam_pile, SEAM **seam, TBLOB *blob); + void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, + EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, + SEAM **seam, TBLOB *blob); + + // gradechop.cpp + PRIORITY grade_split_length(SPLIT *split); + PRIORITY grade_sharpness(SPLIT *split); + + // outlines.cpp + bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt); + + // pieces.cpp + virtual BLOB_CHOICE_LIST *classify_piece(const std::vector<SEAM *> &seams, int16_t start, + int16_t end, const char *description, TWERD *word, + BlamerBundle *blamer_bundle); + + // Member variables. + + std::unique_ptr<LanguageModel> language_model_; + PRIORITY pass2_ok_split; + // Stores the best choice for the previous word in the paragraph. + // This variable is modified by PAGE_RES_IT when iterating over + // words to OCR on the page. + WERD_CHOICE *prev_word_best_choice_; + + // Function used to fill char choice lattices. + void (Wordrec::*fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, + const UNICHARSET &unicharset, BlamerBundle *blamer_bundle); + +protected: + inline bool SegSearchDone(int num_futile_classifications) { + return (language_model_->AcceptableChoiceFound() || + num_futile_classifications >= segsearch_max_futile_classifications); + } + + // Updates the language model state recorded for the child entries specified + // in pending[starting_col]. Enqueues the children of the updated entries + // into pending and proceeds to update (and remove from pending) all the + // remaining entries in pending[col] (col >= starting_col). Upon termination + // of this function all the pending[col] lists will be empty. + // + // The arguments: + // + // starting_col: index of the column in chunks_record->ratings from + // which the update should be started + // + // pending: list of entries listing chunks_record->ratings entries + // that should be updated + // + // pain_points: priority heap listing the pain points generated by + // the language model + // + // temp_pain_points: temporary storage for tentative pain points generated + // by the language model after a single call to LanguageModel::UpdateState() + // (the argument is passed in rather than created before each + // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation) + // + // best_choice_bundle: a collection of variables that should be updated + // if a new best choice is found + // + void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, + std::vector<SegSearchPending> *pending, WERD_RES *word_res, + LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, + BlamerBundle *blamer_bundle); + + // Process the given pain point: classify the corresponding blob, enqueue + // new pain points to join the newly classified blob with its neighbors. + void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, + const char *pain_point_type, + std::vector<SegSearchPending> *pending, WERD_RES *word_res, + LMPainPoints *pain_points, BlamerBundle *blamer_bundle); + // Resets enough of the results so that the Viterbi search is re-run. + // Needed when the n-gram model is enabled, as the multi-length comparison + // implementation will re-value existing paths to worse values. + void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, + std::vector<SegSearchPending> &pending); + + // Add pain points for classifying blobs on the correct segmentation path + // (so that we can evaluate correct segmentation path and discover the reason + // for incorrect result). + void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, + BlamerBundle *blamer_bundle, std::string &blamer_debug); +}; + +} // namespace tesseract + +#endif // DISABLED_LEGACY_ENGINE + +#endif // TESSERACT_WORDREC_WORDREC_H_
