Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/classify/classify.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/classify/classify.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,491 @@ +/////////////////////////////////////////////////////////////////////// +// File: classify.h +// Description: classify class. +// Author: Samuel Charron +// +// (C) Copyright 2006, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_CLASSIFY_CLASSIFY_H_ +#define TESSERACT_CLASSIFY_CLASSIFY_H_ + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#ifdef DISABLED_LEGACY_ENGINE + +# include "ccstruct.h" +# include "dict.h" + +namespace tesseract { + +class Classify : public CCStruct { +public: + Classify(); + virtual ~Classify(); + virtual Dict &getDict() { + return dict_; + } + + // Member variables. 
+ + INT_VAR_H(classify_debug_level); + BOOL_VAR_H(classify_bln_numeric_mode); + double_VAR_H(classify_max_rating_ratio); + double_VAR_H(classify_max_certainty_margin); + +private: + Dict dict_; +}; + +} // namespace tesseract + +#else // DISABLED_LEGACY_ENGINE not defined + +# include "adaptive.h" +# include "ccstruct.h" +# include "dict.h" +# include "featdefs.h" +# include "fontinfo.h" +# include "intfx.h" +# include "intmatcher.h" +# include "normalis.h" +# include "ocrfeatures.h" +# include "ratngs.h" +# include "unicity_table.h" + +namespace tesseract { + +class ScrollView; +class WERD_CHOICE; +class WERD_RES; +struct ADAPT_RESULTS; +struct NORM_PROTOS; + +static const int kUnknownFontinfoId = -1; +static const int kBlankFontinfoId = -2; + +class ShapeClassifier; +struct ShapeRating; +class ShapeTable; +struct UnicharRating; + +// How segmented is a blob. In this enum, character refers to a classifiable +// unit, but that is too long and character is usually easier to understand. +enum CharSegmentationType { + CST_FRAGMENT, // A partial character. + CST_WHOLE, // A correctly segmented character. + CST_IMPROPER, // More than one but less than 2 characters. + CST_NGRAM // Multiple characters. +}; + +class TESS_API Classify : public CCStruct { +public: + Classify(); + ~Classify() override; + virtual Dict &getDict() { + return dict_; + } + + const ShapeTable *shape_table() const { + return shape_table_; + } + + // Takes ownership of the given classifier, and uses it for future calls + // to CharNormClassifier. + void SetStaticClassifier(ShapeClassifier *static_classifier); + + // Adds a noise classification result that is a bit worse than the worst + // current result, or the worst possible result if no current results. + void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices); + + // Returns true if the blob is small enough to be a large speckle. 
+ bool LargeSpeckle(const TBLOB &blob); + + /* adaptive.cpp ************************************************************/ + int GetFontinfoId(ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId); + // Runs the class pruner from int_templates on the given features, returning + // the number of classes output in results. + // int_templates Class pruner tables + // num_features Number of features in blob + // features Array of features + // normalization_factors (input) Array of int_templates->NumClasses fudge + // factors from blob normalization process. + // (Indexed by CLASS_INDEX) + // expected_num_features (input) Array of int_templates->NumClasses + // expected number of features for each class. + // (Indexed by CLASS_INDEX) + // results (output) Sorted Array of pruned classes. + // Array must be sized to take the maximum possible + // number of outputs : int_templates->NumClasses. + int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, + const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, + const uint16_t *expected_num_features, std::vector<CP_RESULT_STRUCT> *results); + void ReadNewCutoffs(TFile *fp, uint16_t *Cutoffs); + void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates); + void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates); + ADAPT_TEMPLATES_STRUCT *ReadAdaptedTemplates(TFile *File); + /* normmatch.cpp ************************************************************/ + float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch); + void FreeNormProtos(); + NORM_PROTOS *ReadNormProtos(TFile *fp); + /* protos.cpp ***************************************************************/ + void ConvertProto(PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class); + INT_TEMPLATES_STRUCT *CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset); + /* adaptmatch.cpp ***********************************************************/ + + 
// Learns the given word using its chopped_word, seam_array, denorm, + // box_word, best_state, and correct_text to learn both correctly and + // incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob + // is called and the data will be saved in an internal buffer. + // Otherwise AdaptToBlob is called for adaption within a document. + void LearnWord(const char *fontname, WERD_RES *word); + + // Builds a blob of length fragments, from the word, starting at start, + // and then learns it, as having the given correct_text. + // If fontname is not nullptr, then LearnBlob is called and the data will be + // saved in an internal buffer for static training. + // Otherwise AdaptToBlob is called for adaption within a document. + // threshold is a magic number required by AdaptToChar and generated by + // ComputeAdaptionThresholds. + // Although it can be partly inferred from the string, segmentation is + // provided to explicitly clarify the character segmentation. + void LearnPieces(const char *fontname, int start, int length, float threshold, + CharSegmentationType segmentation, const char *correct_text, WERD_RES *word); + void InitAdaptiveClassifier(TessdataManager *mgr); + void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class, + ADAPT_TEMPLATES_STRUCT *Templates); + void AmbigClassifier(const std::vector<INT_FEATURE_STRUCT> &int_features, + const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, + INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes, UNICHAR_ID *ambiguities, + ADAPT_RESULTS *results); + void MasterMatcher(INT_TEMPLATES_STRUCT *templates, int16_t num_features, + const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, + ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier, const TBOX &blob_box, + const std::vector<CP_RESULT_STRUCT> &results, ADAPT_RESULTS *final_results); + // Converts configs to fonts, and if the result is not adapted, and a + // shape_table_ is 
present, the shape is expanded to include all + // unichar_ids represented, before applying a set of corrections to the + // distance rating in int_result, (see ComputeCorrectedRating.) + // The results are added to the final_results output. + void ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT **classes, bool debug, int class_id, int bottom, + int top, float cp_rating, int blob_length, + int matcher_multiplier, const uint8_t *cn_factors, + UnicharRating *int_result, ADAPT_RESULTS *final_results); + // Applies a set of corrections to the distance im_rating, + // including the cn_correction, miss penalty and additional penalty + // for non-alnums being vertical misfits. Returns the corrected distance. + double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, + int feature_misses, int bottom, int top, int blob_length, + int matcher_multiplier, const uint8_t *cn_factors); + void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, + BLOB_CHOICE_LIST *Choices); + void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results); + int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures); + +# ifndef GRAPHICS_DISABLED + void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results); +# endif + PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], + INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class, BIT_VECTOR TempProtoMask); + int MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId, + int NumFeatures, INT_FEATURE_ARRAY Features, + FEATURE_SET FloatFeatures); + void MakePermanent(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob); + void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results); + void RemoveExtraPuncs(ADAPT_RESULTS *Results); + void RemoveBadMatches(ADAPT_RESULTS *Results); + void SetAdaptiveThreshold(float Threshold); + void 
ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, int num_features); + // Returns a string for the classifier class_id: either the corresponding + // unicharset debug_str or the shape_table_ debug str. + std::string ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id, + int config_id) const; + // Converts a classifier class_id index with a config ID to: + // shape_table_ present: a shape_table_ index OR + // No shape_table_: a font ID. + // Without shape training, each class_id, config pair represents a single + // unichar id/font combination, so this function looks up the corresponding + // font id. + // With shape training, each class_id, config pair represents a single + // shape table index, so the fontset_table_ stores the shape table index, + // and the shape_table_ must be consulted to obtain the actual unichar_id/ + // font combinations that the shape represents. + int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const; + // Converts a shape_table_ index to a classifier class_id index (not a + // unichar-id!). Uses a search, so not fast. + int ShapeIDToClassID(int shape_id) const; + UNICHAR_ID *BaselineClassifier(TBLOB *Blob, const std::vector<INT_FEATURE_STRUCT> &int_features, + const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES_STRUCT *Templates, + ADAPT_RESULTS *Results); + int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results); + + // As CharNormClassifier, but operates on a TrainingSample and outputs to + // a vector of UnicharRating without conversion to classes. 
+ int CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample, + std::vector<UnicharRating> *results); + UNICHAR_ID *GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass); + void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results); + void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, + ADAPT_TEMPLATES_STRUCT *adaptive_templates); + void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class); + bool AdaptableWord(WERD_RES *word); + void EndAdaptiveClassifier(); + void SetupPass1(); + void SetupPass2(); + void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices); + void ClassifyAsNoise(ADAPT_RESULTS *Results); + void ResetAdaptiveClassifierInternal(); + void SwitchAdaptiveClassifier(); + void StartBackupAdaptiveClassifier(); + + int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates, + uint8_t *pruner_norm_array, uint8_t *char_norm_array); + // Computes the char_norm_array for the unicharset and, if not nullptr, the + // pruner_array as appropriate according to the existence of the shape_table. + // The norm_feature is deleted as it is almost certainly no longer needed. + void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, + uint8_t *char_norm_array, uint8_t *pruner_array); + + bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config); + void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob); + + bool AdaptiveClassifierIsFull() const { + return NumAdaptationsFailed > 0; + } + bool AdaptiveClassifierIsEmpty() const { + return AdaptedTemplates->NumPermClasses == 0; + } + bool LooksLikeGarbage(TBLOB *blob); +#ifndef GRAPHICS_DISABLED + void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox); +#endif + // intfx.cpp + // Computes the DENORMS for bl(baseline) and cn(character) normalization + // during feature extraction. 
The input denorm describes the current state + // of the blob, which is usually a baseline-normalized word. + // The transforms set up are as follows: + // Baseline Normalized (bl) Output: + // We center the grapheme by aligning the x-coordinate of its centroid with + // x=128 and leaving the already-baseline-normalized y as-is. + // + // Character Normalized (cn) Output: + // We align the grapheme's centroid at the origin and scale it + // asymmetrically in x and y so that the 2nd moments are a standard value + // (51.2) i.e. the result is vaguely square. + // If classify_nonlinear_norm is true: + // A non-linear normalization is set up that attempts to evenly distribute + // edges across x and y. + // + // Some of the fields of fx_info are also set up: + // Length: Total length of outline. + // Rx: Rounded y second moment. (Reversed by convention.) + // Ry: Rounded x second moment. + // Xmean: Rounded x center of mass of the blob. + // Ymean: Rounded y center of mass of the blob. + static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, + DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info); + + // Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as + // (x,y) position and angle as measured counterclockwise from the vector + // <-1, 0>, from blob using two normalizations defined by bl_denorm and + // cn_denorm. See SetupBLCNDenorms for definitions. + // If outline_cn_counts is not nullptr, on return it contains the cumulative + // number of cn features generated for each outline in the blob (in order). + // Thus after the first outline, there were (*outline_cn_counts)[0] features, + // after the second outline, there were (*outline_cn_counts)[1] features etc. 
+ static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, + std::vector<INT_FEATURE_STRUCT> *bl_features, + std::vector<INT_FEATURE_STRUCT> *cn_features, + INT_FX_RESULT_STRUCT *results, std::vector<int> *outline_cn_counts); + /* float2int.cpp ************************************************************/ + void ClearCharNormArray(uint8_t *char_norm_array); + void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array); + void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures); + /* intproto.cpp *************************************************************/ + INT_TEMPLATES_STRUCT *ReadIntTemplates(TFile *fp); + void WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates, const UNICHARSET &target_unicharset); + CLASS_ID GetClassToDebug(const char *Prompt, bool *adaptive_on, bool *pretrained_on, + int *shape_id); + void ShowMatchDisplay(); + /* font detection ***********************************************************/ + UnicityTable<FontInfo> &get_fontinfo_table() { + return fontinfo_table_; + } + const UnicityTable<FontInfo> &get_fontinfo_table() const { + return fontinfo_table_; + } + UnicityTable<FontSet> &get_fontset_table() { + return fontset_table_; + } + /* mfoutline.cpp ***********************************************************/ + void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale); + /* outfeat.cpp ***********************************************************/ + FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob); + /* picofeat.cpp ***********************************************************/ + FEATURE_SET ExtractPicoFeatures(TBLOB *Blob); + FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info); + FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info); + /* blobclass.cpp ***********************************************************/ + // Extracts features from the given blob and saves them in the tr_file_data_ + 
// member variable. + // fontname: Name of font that this blob was printed in. + // cn_denorm: Character normalization transformation to apply to the blob. + // fx_info: Character normalization parameters computed with cn_denorm. + // blob_text: Ground truth text for the blob. + void LearnBlob(const std::string &fontname, TBLOB *Blob, const DENORM &cn_denorm, + const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text); + // Writes stored training data to a .tr file based on the given filename. + // Returns false on error. + bool WriteTRFile(const char *filename); + + // Member variables. + + // Parameters. + // Set during training (in lang.config) to indicate whether the divisible + // blobs chopper should be used (true for latin script.) + BOOL_VAR_H(allow_blob_division); + // Set during training (in lang.config) to indicate whether the divisible + // blobs chopper should be used in preference to chopping. Set to true for + // southern Indic scripts. + BOOL_VAR_H(prioritize_division); + BOOL_VAR_H(classify_enable_learning); + INT_VAR_H(classify_debug_level); + + /* mfoutline.cpp ***********************************************************/ + /* control knobs used to control normalization of outlines */ + INT_VAR_H(classify_norm_method); + double_VAR_H(classify_char_norm_range); + double_VAR_H(classify_max_rating_ratio); + double_VAR_H(classify_max_certainty_margin); + + /* adaptmatch.cpp ***********************************************************/ + BOOL_VAR_H(tess_cn_matching); + BOOL_VAR_H(tess_bn_matching); + BOOL_VAR_H(classify_enable_adaptive_matcher); + BOOL_VAR_H(classify_use_pre_adapted_templates); + BOOL_VAR_H(classify_save_adapted_templates); + BOOL_VAR_H(classify_enable_adaptive_debugger); + BOOL_VAR_H(classify_nonlinear_norm); + INT_VAR_H(matcher_debug_level); + INT_VAR_H(matcher_debug_flags); + INT_VAR_H(classify_learning_debug_level); + double_VAR_H(matcher_good_threshold); + double_VAR_H(matcher_reliable_adaptive_result); + 
double_VAR_H(matcher_perfect_threshold); + double_VAR_H(matcher_bad_match_pad); + double_VAR_H(matcher_rating_margin); + double_VAR_H(matcher_avg_noise_size); + INT_VAR_H(matcher_permanent_classes_min); + INT_VAR_H(matcher_min_examples_for_prototyping); + INT_VAR_H(matcher_sufficient_examples_for_prototyping); + double_VAR_H(matcher_clustering_max_angle_delta); + double_VAR_H(classify_misfit_junk_penalty); + double_VAR_H(rating_scale); + double_VAR_H(tessedit_class_miss_scale); + double_VAR_H(classify_adapted_pruning_factor); + double_VAR_H(classify_adapted_pruning_threshold); + INT_VAR_H(classify_adapt_proto_threshold); + INT_VAR_H(classify_adapt_feature_threshold); + BOOL_VAR_H(disable_character_fragments); + double_VAR_H(classify_character_fragments_garbage_certainty_threshold); + BOOL_VAR_H(classify_debug_character_fragments); + BOOL_VAR_H(matcher_debug_separate_windows); + STRING_VAR_H(classify_learn_debug_str); + + /* intmatcher.cpp **********************************************************/ + INT_VAR_H(classify_class_pruner_threshold); + INT_VAR_H(classify_class_pruner_multiplier); + INT_VAR_H(classify_cp_cutoff_strength); + INT_VAR_H(classify_integer_matcher_multiplier); + + BOOL_VAR_H(classify_bln_numeric_mode); + double_VAR_H(speckle_large_max_size); + double_VAR_H(speckle_rating_penalty); + + // Use class variables to hold onto built-in templates and adapted templates. + INT_TEMPLATES_STRUCT *PreTrainedTemplates = nullptr; + ADAPT_TEMPLATES_STRUCT *AdaptedTemplates = nullptr; + // The backup adapted templates are created from the previous page (only) + // so they are always ready and reasonably well trained if the primary + // adapted templates become full. + ADAPT_TEMPLATES_STRUCT *BackupAdaptedTemplates = nullptr; + + // Create dummy proto and config masks for use with the built-in templates. 
+ BIT_VECTOR AllProtosOn = nullptr; + BIT_VECTOR AllConfigsOn = nullptr; + BIT_VECTOR AllConfigsOff = nullptr; + BIT_VECTOR TempProtoMask = nullptr; + /* normmatch.cpp */ + NORM_PROTOS *NormProtos = nullptr; + /* font detection ***********************************************************/ + UnicityTable<FontInfo> fontinfo_table_; + // Without shape training, each class_id, config pair represents a single + // unichar id/font combination, so each fontset_table_ entry holds font ids + // for each config in the class. + // With shape training, each class_id, config pair represents a single + // shape_table_ index, so the fontset_table_ stores the shape_table_ index, + // and the shape_table_ must be consulted to obtain the actual unichar_id/ + // font combinations that the shape represents. + UnicityTable<FontSet> fontset_table_; + +protected: + IntegerMatcher im_; + FEATURE_DEFS_STRUCT feature_defs_; + // If a shape_table_ is present, it is used to remap classifier output in + // ExpandShapesAndApplyCorrections. font_ids referenced by configs actually + // mean an index to the shape_table_ and the choices returned are *all* the + // shape_table_ entries at that index. + ShapeTable *shape_table_ = nullptr; + +private: + // The currently active static classifier. + ShapeClassifier *static_classifier_ = nullptr; +#ifndef GRAPHICS_DISABLED + ScrollView *learn_debug_win_ = nullptr; + ScrollView *learn_fragmented_word_debug_win_ = nullptr; + ScrollView *learn_fragments_debug_win_ = nullptr; +#endif + + // Training data gathered here for all the images in a document. + std::string tr_file_data_; + + Dict dict_; + + std::vector<uint16_t> shapetable_cutoffs_; + + /* variables used to hold performance statistics */ + int NumAdaptationsFailed = 0; + + // Expected number of features in the class pruner, used to penalize + // unknowns that have too few features (like a c being classified as e) so + // it doesn't recognize everything as '@' or '#'. 
+ // CharNormCutoffs is for the static classifier (with no shapetable). + // BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real + // value in the adaptive classifier. Both are indexed by unichar_id. + // shapetable_cutoffs_ provides a similar value for each shape in the + // shape_table_ + uint16_t CharNormCutoffs[MAX_NUM_CLASSES]; + uint16_t BaselineCutoffs[MAX_NUM_CLASSES]; + +public: + bool EnableLearning = true; +}; + +} // namespace tesseract + +#endif // DISABLED_LEGACY_ENGINE + +#endif // TESSERACT_CLASSIFY_CLASSIFY_H_
