Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/ccstruct/ratngs.h comparison

comparison mupdf-source/thirdparty/tesseract/src/ccstruct/ratngs.h @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:44:09 +0200
parents	b50eed0cc0ef
children

comparison

equal deleted inserted replaced

-:6015a75abc2d
+:2c135c81b16c
+/**********************************************************************
+* File:        ratngs.h  (Formerly ratings.h)
+* Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes.
+* Author:      Ray Smith
+*
+* (C) Copyright 1992, Hewlett-Packard Ltd.
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+** http://www.apache.org/licenses/LICENSE-2.0
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*
+**********************************************************************/
+#ifndef RATNGS_H
+#define RATNGS_H
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
+#endif
+#include "clst.h"
+#include "elst.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "fontinfo.h"
+#endif // undef DISABLED_LEGACY_ENGINE
+#include "matrix.h"
+#include "unicharset.h"
+#include "werd.h"
+#include <tesseract/unichar.h>
+#include <cassert>
+#include <cfloat> // for FLT_MAX
+namespace tesseract {
+class MATRIX;
+struct TBLOB;
+struct TWERD;
+// Enum to describe the source of a BLOB_CHOICE to make it possible to determine
+// whether a blob has been classified by inspecting the BLOB_CHOICEs.
+enum BlobChoiceClassifier {
+BCC_STATIC_CLASSIFIER,  // From the char_norm classifier.
+BCC_ADAPTED_CLASSIFIER, // From the adaptive classifier.
+BCC_SPECKLE_CLASSIFIER, // Backup for failed classification.
+BCC_AMBIG,              // Generated by ambiguity detection.
+BCC_FAKE,               // From some other process.
+};
+class BLOB_CHOICE : public ELIST_LINK {
+public:
+BLOB_CHOICE() {
+unichar_id_ = UNICHAR_SPACE;
+fontinfo_id_ = -1;
+fontinfo_id2_ = -1;
+rating_ = 10.0f;
+certainty_ = -1.0f;
+script_id_ = -1;
+min_xheight_ = 0.0f;
+max_xheight_ = 0.0f;
+yshift_ = 0.0f;
+classifier_ = BCC_FAKE;
+}
+BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
+float src_rating,          // rating
+float src_cert,            // certainty
+int script_id,             // script
+float min_xheight,         // min xheight in image pixel units
+float max_xheight,         // max xheight allowed by this char
+float yshift,              // the larger of y shift (top or bottom)
+BlobChoiceClassifier c);   // adapted match or other
+BLOB_CHOICE(const BLOB_CHOICE &other);
+~BLOB_CHOICE() = default;
+UNICHAR_ID unichar_id() const {
+return unichar_id_;
+}
+float rating() const {
+return rating_;
+}
+float certainty() const {
+return certainty_;
+}
+int16_t fontinfo_id() const {
+return fontinfo_id_;
+}
+int16_t fontinfo_id2() const {
+return fontinfo_id2_;
+}
+#ifndef DISABLED_LEGACY_ENGINE
+const std::vector<ScoredFont> &fonts() const {
+return fonts_;
+}
+void set_fonts(const std::vector<ScoredFont> &fonts) {
+fonts_ = fonts;
+int score1 = 0, score2 = 0;
+fontinfo_id_ = -1;
+fontinfo_id2_ = -1;
+for (auto &f : fonts_) {
+if (f.score > score1) {
+score2 = score1;
+fontinfo_id2_ = fontinfo_id_;
+score1 = f.score;
+fontinfo_id_ = f.fontinfo_id;
+} else if (f.score > score2) {
+score2 = f.score;
+fontinfo_id2_ = f.fontinfo_id;
+}
+}
+}
+#endif // ndef DISABLED_LEGACY_ENGINE
+int script_id() const {
+return script_id_;
+}
+const MATRIX_COORD &matrix_cell() {
+return matrix_cell_;
+}
+float min_xheight() const {
+return min_xheight_;
+}
+float max_xheight() const {
+return max_xheight_;
+}
+float yshift() const {
+return yshift_;
+}
+BlobChoiceClassifier classifier() const {
+return classifier_;
+}
+bool IsAdapted() const {
+return classifier_ == BCC_ADAPTED_CLASSIFIER;
+}
+bool IsClassified() const {
+return classifier_ == BCC_STATIC_CLASSIFIER || classifier_ == BCC_ADAPTED_CLASSIFIER ||
+classifier_ == BCC_SPECKLE_CLASSIFIER;
+}
+void set_unichar_id(UNICHAR_ID newunichar_id) {
+unichar_id_ = newunichar_id;
+}
+void set_rating(float newrat) {
+rating_ = newrat;
+}
+void set_certainty(float newrat) {
+certainty_ = newrat;
+}
+void set_script(int newscript_id) {
+script_id_ = newscript_id;
+}
+void set_matrix_cell(int col, int row) {
+matrix_cell_.col = col;
+matrix_cell_.row = row;
+}
+void set_classifier(BlobChoiceClassifier classifier) {
+classifier_ = classifier;
+}
+static BLOB_CHOICE *deep_copy(const BLOB_CHOICE *src) {
+auto *choice = new BLOB_CHOICE;
+*choice = *src;
+return choice;
+}
+// Returns true if *this and other agree on the baseline and x-height
+// to within some tolerance based on a given estimate of the x-height.
+bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const;
+void print(const UNICHARSET *unicharset) const {
+tprintf("r%.2f c%.2f x[%g,%g]: %d %s",
+static_cast<double>(rating_),
+static_cast<double>(certainty_),
+static_cast<double>(min_xheight_),
+static_cast<double>(max_xheight_),
+unichar_id_, (unicharset == nullptr) ? "" : unicharset->debug_str(unichar_id_).c_str());
+}
+void print_full() const {
+print(nullptr);
+tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n", script_id_, fontinfo_id_,
+fontinfo_id2_, static_cast<double>(yshift_), classifier_);
+}
+// Sort function for sorting BLOB_CHOICEs in increasing order of rating.
+static int SortByRating(const void *p1, const void *p2) {
+const BLOB_CHOICE *bc1 = *static_cast<const BLOB_CHOICE *const *>(p1);
+const BLOB_CHOICE *bc2 = *static_cast<const BLOB_CHOICE *const *>(p2);
+return (bc1->rating_ < bc2->rating_) ? -1 : 1;
+}
+private:
+// Copy assignment operator.
+BLOB_CHOICE &operator=(const BLOB_CHOICE &other);
+UNICHAR_ID unichar_id_; // unichar id
+#ifndef DISABLED_LEGACY_ENGINE
+// Fonts and scores. Allowed to be empty.
+std::vector<ScoredFont> fonts_;
+#endif                   // ndef DISABLED_LEGACY_ENGINE
+int16_t fontinfo_id_;  // char font information
+int16_t fontinfo_id2_; // 2nd choice font information
+// Rating is the classifier distance weighted by the length of the outline
+// in the blob. In terms of probability, classifier distance is -klog p such
+// that the resulting distance is in the range [0, 1] and then
+// rating = w (-k log p) where w is the weight for the length of the outline.
+// Sums of ratings may be compared meaningfully for words of different
+// segmentation.
+float rating_; // size related
+// Certainty is a number in [-20, 0] indicating the classifier certainty
+// of the choice. In terms of probability, certainty = 20 (k log p) where
+// k is defined as above to normalize -klog p to the range [0, 1].
+float certainty_; // absolute
+int script_id_;
+// Holds the position of this choice in the ratings matrix.
+// Used to location position in the matrix during path backtracking.
+MATRIX_COORD matrix_cell_;
+// X-height range (in image pixels) that this classification supports.
+float min_xheight_;
+float max_xheight_;
+// yshift_ - The vertical distance (in image pixels) the character is
+//           shifted (up or down) from an acceptable y position.
+float yshift_;
+BlobChoiceClassifier classifier_; // What generated *this.
+};
+// Make BLOB_CHOICE listable.
+ELISTIZEH(BLOB_CHOICE)
+// Return the BLOB_CHOICE in bc_list matching a given unichar_id,
+// or nullptr if there is no match.
+BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list);
+// Permuter codes used in WERD_CHOICEs.
+enum PermuterType {
+NO_PERM,           // 0
+PUNC_PERM,         // 1
+TOP_CHOICE_PERM,   // 2
+LOWER_CASE_PERM,   // 3
+UPPER_CASE_PERM,   // 4
+NGRAM_PERM,        // 5
+NUMBER_PERM,       // 6
+USER_PATTERN_PERM, // 7
+SYSTEM_DAWG_PERM,  // 8
+DOC_DAWG_PERM,     // 9
+USER_DAWG_PERM,    // 10
+FREQ_DAWG_PERM,    // 11
+COMPOUND_PERM,     // 12
+NUM_PERMUTER_TYPES
+};
+// ScriptPos tells whether a character is subscript, superscript or normal.
+enum ScriptPos { SP_NORMAL, SP_SUBSCRIPT, SP_SUPERSCRIPT, SP_DROPCAP };
+const char *ScriptPosToString(ScriptPos script_pos);
+class TESS_API WERD_CHOICE : public ELIST_LINK {
+public:
+static const float kBadRating;
+static const char *permuter_name(uint8_t permuter);
+WERD_CHOICE(const UNICHARSET *unicharset) : unicharset_(unicharset) {
+this->init(8);
+}
+WERD_CHOICE(const UNICHARSET *unicharset, int reserved) : unicharset_(unicharset) {
+this->init(reserved);
+}
+WERD_CHOICE(const char *src_string, const char *src_lengths, float src_rating,
+float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset)
+: unicharset_(&unicharset) {
+this->init(src_string, src_lengths, src_rating, src_certainty, src_permuter);
+}
+WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);
+WERD_CHOICE(const WERD_CHOICE &word) : ELIST_LINK(word), unicharset_(word.unicharset_) {
+this->init(word.length());
+this->operator=(word);
+}
+~WERD_CHOICE();
+const UNICHARSET *unicharset() const {
+return unicharset_;
+}
+bool empty() const {
+return length_ == 0;
+}
+inline unsigned length() const {
+return length_;
+}
+float adjust_factor() const {
+return adjust_factor_;
+}
+void set_adjust_factor(float factor) {
+adjust_factor_ = factor;
+}
+inline const std::vector<UNICHAR_ID> &unichar_ids() const {
+return unichar_ids_;
+}
+inline UNICHAR_ID unichar_id(unsigned index) const {
+assert(index < length_);
+return unichar_ids_[index];
+}
+inline unsigned state(unsigned index) const {
+return state_[index];
+}
+ScriptPos BlobPosition(unsigned index) const {
+if (index >= length_) {
+return SP_NORMAL;
+}
+return script_pos_[index];
+}
+inline float rating() const {
+return rating_;
+}
+inline float certainty() const {
+return certainty_;
+}
+inline float certainty(unsigned index) const {
+return certainties_[index];
+}
+inline float min_x_height() const {
+return min_x_height_;
+}
+inline float max_x_height() const {
+return max_x_height_;
+}
+inline void set_x_heights(float min_height, float max_height) {
+min_x_height_ = min_height;
+max_x_height_ = max_height;
+}
+inline uint8_t permuter() const {
+return permuter_;
+}
+const char *permuter_name() const;
+// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
+// taken from the appropriate cell in the ratings MATRIX.
+// Borrowed pointer, so do not delete.
+BLOB_CHOICE_LIST *blob_choices(unsigned index, MATRIX *ratings) const;
+// Returns the MATRIX_COORD corresponding to the location in the ratings
+// MATRIX for the given index into the word.
+MATRIX_COORD MatrixCoord(unsigned index) const;
+inline void set_unichar_id(UNICHAR_ID unichar_id, unsigned index) {
+assert(index < length_);
+unichar_ids_[index] = unichar_id;
+}
+bool dangerous_ambig_found() const {
+return dangerous_ambig_found_;
+}
+void set_dangerous_ambig_found_(bool value) {
+dangerous_ambig_found_ = value;
+}
+inline void set_rating(float new_val) {
+rating_ = new_val;
+}
+inline void set_certainty(float new_val) {
+certainty_ = new_val;
+}
+inline void set_permuter(uint8_t perm) {
+permuter_ = perm;
+}
+// Note: this function should only be used if all the fields
+// are populated manually with set_* functions (rather than
+// (copy)constructors and append_* functions).
+inline void set_length(unsigned len) {
+ASSERT_HOST(reserved_ >= len);
+length_ = len;
+}
+/// Make more space in unichar_id_ and fragment_lengths_ arrays.
+inline void double_the_size() {
+if (reserved_ > 0) {
+reserved_ *= 2;
+} else {
+reserved_ = 1;
+}
+unichar_ids_.resize(reserved_);
+script_pos_.resize(reserved_);
+state_.resize(reserved_);
+certainties_.resize(reserved_);
+}
+/// Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and
+/// fragment_length_ arrays. Sets other values to default (blank) values.
+inline void init(unsigned reserved) {
+reserved_ = reserved;
+if (reserved > 0) {
+unichar_ids_.resize(reserved);
+script_pos_.resize(reserved);
+state_.resize(reserved);
+certainties_.resize(reserved);
+} else {
+unichar_ids_.clear();
+script_pos_.clear();
+state_.clear();
+certainties_.clear();
+}
+length_ = 0;
+adjust_factor_ = 1.0f;
+rating_ = 0.0;
+certainty_ = FLT_MAX;
+min_x_height_ = 0.0f;
+max_x_height_ = FLT_MAX;
+permuter_ = NO_PERM;
+unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
+dangerous_ambig_found_ = false;
+}
+/// Helper function to build a WERD_CHOICE from the given string,
+/// fragment lengths, rating, certainty and permuter.
+/// The function assumes that src_string is not nullptr.
+/// src_lengths argument could be nullptr, in which case the unichars
+/// in src_string are assumed to all be of length 1.
+void init(const char *src_string, const char *src_lengths, float src_rating, float src_certainty,
+uint8_t src_permuter);
+/// Set the fields in this choice to be default (bad) values.
+inline void make_bad() {
+length_ = 0;
+rating_ = kBadRating;
+certainty_ = -FLT_MAX;
+}
+/// This function assumes that there is enough space reserved
+/// in the WERD_CHOICE for adding another unichar.
+/// This is an efficient alternative to append_unichar_id().
+inline void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating,
+float certainty) {
+assert(reserved_ > length_);
+length_++;
+this->set_unichar_id(unichar_id, blob_count, rating, certainty, length_ - 1);
+}
+void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty);
+inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty,
+unsigned index) {
+assert(index < length_);
+unichar_ids_[index] = unichar_id;
+state_[index] = blob_count;
+certainties_[index] = certainty;
+script_pos_[index] = SP_NORMAL;
+rating_ += rating;
+if (certainty < certainty_) {
+certainty_ = certainty;
+}
+}
+// Sets the entries for the given index from the BLOB_CHOICE, assuming
+// unit fragment lengths, but setting the state for this index to blob_count.
+void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice);
+bool contains_unichar_id(UNICHAR_ID unichar_id) const;
+void remove_unichar_ids(unsigned index, int num);
+inline void remove_last_unichar_id() {
+--length_;
+}
+inline void remove_unichar_id(unsigned index) {
+this->remove_unichar_ids(index, 1);
+}
+bool has_rtl_unichar_id() const;
+void reverse_and_mirror_unichar_ids();
+// Returns the half-open interval of unichar_id indices [start, end) which
+// enclose the core portion of this word -- the part after stripping
+// punctuation from the left and right.
+void punct_stripped(unsigned *start_core, unsigned *end_core) const;
+// Returns the indices [start, end) containing the core of the word, stripped
+// of any superscript digits on either side. (i.e., the non-footnote part
+// of the word). There is no guarantee that the output range is non-empty.
+void GetNonSuperscriptSpan(int *start, int *end) const;
+// Return a copy of this WERD_CHOICE with the choices [start, end).
+// The result is useful only for checking against a dictionary.
+WERD_CHOICE shallow_copy(unsigned start, unsigned end) const;
+void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const;
+std::string debug_string() const {
+std::string word_str;
+for (unsigned i = 0; i < length_; ++i) {
+word_str += unicharset_->debug_str(unichar_ids_[i]);
+word_str += " ";
+}
+return word_str;
+}
+// Returns true if any unichar_id in the word is a non-space-delimited char.
+bool ContainsAnyNonSpaceDelimited() const {
+for (unsigned i = 0; i < length_; ++i) {
+if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) {
+return true;
+}
+}
+return false;
+}
+// Returns true if the word is all spaces.
+bool IsAllSpaces() const {
+for (unsigned i = 0; i < length_; ++i) {
+if (unichar_ids_[i] != UNICHAR_SPACE) {
+return false;
+}
+}
+return true;
+}
+// Call this to override the default (strict left to right graphemes)
+// with the fact that some engine produces a "reading order" set of
+// Graphemes for each word.
+bool set_unichars_in_script_order(bool in_script_order) {
+return unichars_in_script_order_ = in_script_order;
+}
+bool unichars_in_script_order() const {
+return unichars_in_script_order_;
+}
+// Returns a UTF-8 string equivalent to the current choice
+// of UNICHAR IDs.
+std::string &unichar_string() {
+this->string_and_lengths(&unichar_string_, &unichar_lengths_);
+return unichar_string_;
+}
+// Returns a UTF-8 string equivalent to the current choice
+// of UNICHAR IDs.
+const std::string &unichar_string() const {
+this->string_and_lengths(&unichar_string_, &unichar_lengths_);
+return unichar_string_;
+}
+// Returns the lengths, one byte each, representing the number of bytes
+// required in the unichar_string for each UNICHAR_ID.
+const std::string &unichar_lengths() const {
+this->string_and_lengths(&unichar_string_, &unichar_lengths_);
+return unichar_lengths_;
+}
+// Sets up the script_pos_ member using the blobs_list to get the bln
+// bounding boxes, *this to get the unichars, and this->unicharset
+// to get the target positions. If small_caps is true, sub/super are not
+// considered, but dropcaps are.
+// NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.)
+void SetScriptPositions(bool small_caps, TWERD *word, int debug = 0);
+// Sets all the script_pos_ positions to the given position.
+void SetAllScriptPositions(ScriptPos position);
+static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset,
+const TBOX &blob_box, UNICHAR_ID unichar_id);
+// Returns the "dominant" script ID for the word.  By "dominant", the script
+// must account for at least half the characters.  Otherwise, it returns 0.
+// Note that for Japanese, Hiragana and Katakana are simply treated as Han.
+int GetTopScriptID() const;
+// Fixes the state_ for a chop at the given blob_posiiton.
+void UpdateStateForSplit(int blob_position);
+// Returns the sum of all the state elements, being the total number of blobs.
+unsigned TotalOfStates() const;
+void print() const {
+this->print("");
+}
+void print(const char *msg) const;
+// Prints the segmentation state with an introductory message.
+void print_state(const char *msg) const;
+// Displays the segmentation state of *this (if not the same as the last
+// one displayed) and waits for a click in the window.
+void DisplaySegmentation(TWERD *word);
+WERD_CHOICE &operator+=(        // concatanate
+const WERD_CHOICE &second); // second on first
+WERD_CHOICE &operator=(const WERD_CHOICE &source);
+private:
+const UNICHARSET *unicharset_;
+// TODO(rays) Perhaps replace the multiple arrays with an array of structs?
+// unichar_ids_ is an array of classifier "results" that make up a word.
+// For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position
+// of each unichar_id.
+// state_[i] indicates the number of blobs in WERD_RES::chopped_word that
+// were put together to make the classification results in the ith position
+// in unichar_ids_, and certainties_[i] is the certainty of the choice that
+// was used in this word.
+// == Change from before ==
+// Previously there was fragment_lengths_ that allowed a word to be
+// artificially composed of multiple fragment results. Since the new
+// segmentation search doesn't do fragments, treatment of fragments has
+// been moved to a lower level, augmenting the ratings matrix with the
+// combined fragments, and allowing the language-model/segmentation-search
+// to deal with only the combined unichar_ids.
+std::vector<UNICHAR_ID> unichar_ids_; // unichar ids that represent the text of the word
+std::vector<ScriptPos> script_pos_;   // Normal/Sub/Superscript of each unichar.
+std::vector<int> state_;              // Number of blobs in each unichar.
+std::vector<float> certainties_;      // Certainty of each unichar.
+unsigned reserved_;            // size of the above arrays
+unsigned length_;              // word length
+// Factor that was used to adjust the rating.
+float adjust_factor_;
+// Rating is the sum of the ratings of the individual blobs in the word.
+float rating_; // size related
+// certainty is the min (worst) certainty of the individual blobs in the word.
+float certainty_; // absolute
+// xheight computed from the result, or 0 if inconsistent.
+float min_x_height_;
+float max_x_height_;
+uint8_t permuter_; // permuter code
+// Normally, the ratings_ matrix represents the recognition results in order
+// from left-to-right.  However, some engines (say Cube) may return
+// recognition results in the order of the script's major reading direction
+// (for Arabic, that is right-to-left).
+bool unichars_in_script_order_;
+// True if NoDangerousAmbig found an ambiguity.
+bool dangerous_ambig_found_;
+// The following variables are populated and passed by reference any
+// time unichar_string() or unichar_lengths() are called.
+mutable std::string unichar_string_;
+mutable std::string unichar_lengths_;
+};
+// Make WERD_CHOICE listable.
+ELISTIZEH(WERD_CHOICE)
+using BLOB_CHOICE_LIST_VECTOR = std::vector<BLOB_CHOICE_LIST *>;
+// Utilities for comparing WERD_CHOICEs
+bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2);
+// Utilities for debug printing.
+void print_ratings_list(const char *msg,                     // intro message
+BLOB_CHOICE_LIST *ratings,           // list of results
+const UNICHARSET &current_unicharset // unicharset that can be used
+// for id-to-unichar conversion
+);
+} // namespace tesseract
+#endif

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/ccstruct/ratngs.h @ 3:2c135c81b16c