Mercurial > hgrepos > Python2 > PyMuPDF

/**********************************************************************
 * File:        ratngs.h  (Formerly ratings.h)
 * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes.
 * Author:      Ray Smith
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#ifndef RATNGS_H
#define RATNGS_H

#ifdef HAVE_CONFIG_H
#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
#endif

#include "clst.h"
#include "elst.h"
#ifndef DISABLED_LEGACY_ENGINE
#  include "fontinfo.h"
#endif // undef DISABLED_LEGACY_ENGINE
#include "matrix.h"
#include "unicharset.h"
#include "werd.h"

#include <tesseract/unichar.h>

#include <cassert>
#include <cfloat> // for FLT_MAX

namespace tesseract {

class MATRIX;
struct TBLOB;
struct TWERD;

// Enum to describe the source of a BLOB_CHOICE to make it possible to determine
// whether a blob has been classified by inspecting the BLOB_CHOICEs.
enum BlobChoiceClassifier {
  BCC_STATIC_CLASSIFIER,  // From the char_norm classifier.
  BCC_ADAPTED_CLASSIFIER, // From the adaptive classifier.
  BCC_SPECKLE_CLASSIFIER, // Backup for failed classification.
  BCC_AMBIG,              // Generated by ambiguity detection.
  BCC_FAKE,               // From some other process.
};

class BLOB_CHOICE : public ELIST_LINK {
public:
  BLOB_CHOICE() {
    unichar_id_ = UNICHAR_SPACE;
    fontinfo_id_ = -1;
    fontinfo_id2_ = -1;
    rating_ = 10.0f;
    certainty_ = -1.0f;
    script_id_ = -1;
    min_xheight_ = 0.0f;
    max_xheight_ = 0.0f;
    yshift_ = 0.0f;
    classifier_ = BCC_FAKE;
  }
  BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
              float src_rating,          // rating
              float src_cert,            // certainty
              int script_id,             // script
              float min_xheight,         // min xheight in image pixel units
              float max_xheight,         // max xheight allowed by this char
              float yshift,              // the larger of y shift (top or bottom)
              BlobChoiceClassifier c);   // adapted match or other
  BLOB_CHOICE(const BLOB_CHOICE &other);
  ~BLOB_CHOICE() = default;

  UNICHAR_ID unichar_id() const {
    return unichar_id_;
  }
  float rating() const {
    return rating_;
  }
  float certainty() const {
    return certainty_;
  }
  int16_t fontinfo_id() const {
    return fontinfo_id_;
  }
  int16_t fontinfo_id2() const {
    return fontinfo_id2_;
  }
#ifndef DISABLED_LEGACY_ENGINE
  const std::vector<ScoredFont> &fonts() const {
    return fonts_;
  }
  void set_fonts(const std::vector<ScoredFont> &fonts) {
    fonts_ = fonts;
    int score1 = 0, score2 = 0;
    fontinfo_id_ = -1;
    fontinfo_id2_ = -1;
    for (auto &f : fonts_) {
      if (f.score > score1) {
        score2 = score1;
        fontinfo_id2_ = fontinfo_id_;
        score1 = f.score;
        fontinfo_id_ = f.fontinfo_id;
      } else if (f.score > score2) {
        score2 = f.score;
        fontinfo_id2_ = f.fontinfo_id;
      }
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE
  int script_id() const {
    return script_id_;
  }
  const MATRIX_COORD &matrix_cell() {
    return matrix_cell_;
  }
  float min_xheight() const {
    return min_xheight_;
  }
  float max_xheight() const {
    return max_xheight_;
  }
  float yshift() const {
    return yshift_;
  }
  BlobChoiceClassifier classifier() const {
    return classifier_;
  }
  bool IsAdapted() const {
    return classifier_ == BCC_ADAPTED_CLASSIFIER;
  }
  bool IsClassified() const {
    return classifier_ == BCC_STATIC_CLASSIFIER || classifier_ == BCC_ADAPTED_CLASSIFIER ||
           classifier_ == BCC_SPECKLE_CLASSIFIER;
  }

  void set_unichar_id(UNICHAR_ID newunichar_id) {
    unichar_id_ = newunichar_id;
  }
  void set_rating(float newrat) {
    rating_ = newrat;
  }
  void set_certainty(float newrat) {
    certainty_ = newrat;
  }
  void set_script(int newscript_id) {
    script_id_ = newscript_id;
  }
  void set_matrix_cell(int col, int row) {
    matrix_cell_.col = col;
    matrix_cell_.row = row;
  }
  void set_classifier(BlobChoiceClassifier classifier) {
    classifier_ = classifier;
  }
  static BLOB_CHOICE *deep_copy(const BLOB_CHOICE *src) {
    auto *choice = new BLOB_CHOICE;
    *choice = *src;
    return choice;
  }
  // Returns true if *this and other agree on the baseline and x-height
  // to within some tolerance based on a given estimate of the x-height.
  bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const;

  void print(const UNICHARSET *unicharset) const {
    tprintf("r%.2f c%.2f x[%g,%g]: %d %s",
            static_cast<double>(rating_),
            static_cast<double>(certainty_),
            static_cast<double>(min_xheight_),
            static_cast<double>(max_xheight_),
            unichar_id_, (unicharset == nullptr) ? "" : unicharset->debug_str(unichar_id_).c_str());
  }
  void print_full() const {
    print(nullptr);
    tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n", script_id_, fontinfo_id_,
            fontinfo_id2_, static_cast<double>(yshift_), classifier_);
  }
  // Sort function for sorting BLOB_CHOICEs in increasing order of rating.
  static int SortByRating(const void *p1, const void *p2) {
    const BLOB_CHOICE *bc1 = *static_cast<const BLOB_CHOICE *const *>(p1);
    const BLOB_CHOICE *bc2 = *static_cast<const BLOB_CHOICE *const *>(p2);
    return (bc1->rating_ < bc2->rating_) ? -1 : 1;
  }

private:
  // Copy assignment operator.
  BLOB_CHOICE &operator=(const BLOB_CHOICE &other);

  UNICHAR_ID unichar_id_; // unichar id
#ifndef DISABLED_LEGACY_ENGINE
  // Fonts and scores. Allowed to be empty.
  std::vector<ScoredFont> fonts_;
#endif                   // ndef DISABLED_LEGACY_ENGINE
  int16_t fontinfo_id_;  // char font information
  int16_t fontinfo_id2_; // 2nd choice font information
  // Rating is the classifier distance weighted by the length of the outline
  // in the blob. In terms of probability, classifier distance is -klog p such
  // that the resulting distance is in the range [0, 1] and then
  // rating = w (-k log p) where w is the weight for the length of the outline.
  // Sums of ratings may be compared meaningfully for words of different
  // segmentation.
  float rating_; // size related
  // Certainty is a number in [-20, 0] indicating the classifier certainty
  // of the choice. In terms of probability, certainty = 20 (k log p) where
  // k is defined as above to normalize -klog p to the range [0, 1].
  float certainty_; // absolute
  int script_id_;
  // Holds the position of this choice in the ratings matrix.
  // Used to location position in the matrix during path backtracking.
  MATRIX_COORD matrix_cell_;
  // X-height range (in image pixels) that this classification supports.
  float min_xheight_;
  float max_xheight_;
  // yshift_ - The vertical distance (in image pixels) the character is
  //           shifted (up or down) from an acceptable y position.
  float yshift_;
  BlobChoiceClassifier classifier_; // What generated *this.
};

// Make BLOB_CHOICE listable.
ELISTIZEH(BLOB_CHOICE)

// Return the BLOB_CHOICE in bc_list matching a given unichar_id,
// or nullptr if there is no match.
BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list);

// Permuter codes used in WERD_CHOICEs.
enum PermuterType {
  NO_PERM,           // 0
  PUNC_PERM,         // 1
  TOP_CHOICE_PERM,   // 2
  LOWER_CASE_PERM,   // 3
  UPPER_CASE_PERM,   // 4
  NGRAM_PERM,        // 5
  NUMBER_PERM,       // 6
  USER_PATTERN_PERM, // 7
  SYSTEM_DAWG_PERM,  // 8
  DOC_DAWG_PERM,     // 9
  USER_DAWG_PERM,    // 10
  FREQ_DAWG_PERM,    // 11
  COMPOUND_PERM,     // 12

  NUM_PERMUTER_TYPES
};

// ScriptPos tells whether a character is subscript, superscript or normal.
enum ScriptPos { SP_NORMAL, SP_SUBSCRIPT, SP_SUPERSCRIPT, SP_DROPCAP };

const char *ScriptPosToString(ScriptPos script_pos);

class TESS_API WERD_CHOICE : public ELIST_LINK {
public:
  static const float kBadRating;
  static const char *permuter_name(uint8_t permuter);

  WERD_CHOICE(const UNICHARSET *unicharset) : unicharset_(unicharset) {
    this->init(8);
  }
  WERD_CHOICE(const UNICHARSET *unicharset, int reserved) : unicharset_(unicharset) {
    this->init(reserved);
  }
  WERD_CHOICE(const char *src_string, const char *src_lengths, float src_rating,
              float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset)
      : unicharset_(&unicharset) {
    this->init(src_string, src_lengths, src_rating, src_certainty, src_permuter);
  }
  WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);
  WERD_CHOICE(const WERD_CHOICE &word) : ELIST_LINK(word), unicharset_(word.unicharset_) {
    this->init(word.length());
    this->operator=(word);
  }
  ~WERD_CHOICE();

  const UNICHARSET *unicharset() const {
    return unicharset_;
  }
  bool empty() const {
    return length_ == 0;
  }
  inline unsigned length() const {
    return length_;
  }
  float adjust_factor() const {
    return adjust_factor_;
  }
  void set_adjust_factor(float factor) {
    adjust_factor_ = factor;
  }
  inline const std::vector<UNICHAR_ID> &unichar_ids() const {
    return unichar_ids_;
  }
  inline UNICHAR_ID unichar_id(unsigned index) const {
    assert(index < length_);
    return unichar_ids_[index];
  }
  inline unsigned state(unsigned index) const {
    return state_[index];
  }
  ScriptPos BlobPosition(unsigned index) const {
    if (index >= length_) {
      return SP_NORMAL;
    }
    return script_pos_[index];
  }
  inline float rating() const {
    return rating_;
  }
  inline float certainty() const {
    return certainty_;
  }
  inline float certainty(unsigned index) const {
    return certainties_[index];
  }
  inline float min_x_height() const {
    return min_x_height_;
  }
  inline float max_x_height() const {
    return max_x_height_;
  }
  inline void set_x_heights(float min_height, float max_height) {
    min_x_height_ = min_height;
    max_x_height_ = max_height;
  }
  inline uint8_t permuter() const {
    return permuter_;
  }
  const char *permuter_name() const;
  // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
  // taken from the appropriate cell in the ratings MATRIX.
  // Borrowed pointer, so do not delete.
  BLOB_CHOICE_LIST *blob_choices(unsigned index, MATRIX *ratings) const;

  // Returns the MATRIX_COORD corresponding to the location in the ratings
  // MATRIX for the given index into the word.
  MATRIX_COORD MatrixCoord(unsigned index) const;

  inline void set_unichar_id(UNICHAR_ID unichar_id, unsigned index) {
    assert(index < length_);
    unichar_ids_[index] = unichar_id;
  }
  bool dangerous_ambig_found() const {
    return dangerous_ambig_found_;
  }
  void set_dangerous_ambig_found_(bool value) {
    dangerous_ambig_found_ = value;
  }
  inline void set_rating(float new_val) {
    rating_ = new_val;
  }
  inline void set_certainty(float new_val) {
    certainty_ = new_val;
  }
  inline void set_permuter(uint8_t perm) {
    permuter_ = perm;
  }
  // Note: this function should only be used if all the fields
  // are populated manually with set_* functions (rather than
  // (copy)constructors and append_* functions).
  inline void set_length(unsigned len) {
    ASSERT_HOST(reserved_ >= len);
    length_ = len;
  }

  /// Make more space in unichar_id_ and fragment_lengths_ arrays.
  inline void double_the_size() {
    if (reserved_ > 0) {
      reserved_ *= 2;
    } else {
      reserved_ = 1;
    }
    unichar_ids_.resize(reserved_);
    script_pos_.resize(reserved_);
    state_.resize(reserved_);
    certainties_.resize(reserved_);
  }

  /// Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and
  /// fragment_length_ arrays. Sets other values to default (blank) values.
  inline void init(unsigned reserved) {
    reserved_ = reserved;
    if (reserved > 0) {
      unichar_ids_.resize(reserved);
      script_pos_.resize(reserved);
      state_.resize(reserved);
      certainties_.resize(reserved);
    } else {
      unichar_ids_.clear();
      script_pos_.clear();
      state_.clear();
      certainties_.clear();
    }
    length_ = 0;
    adjust_factor_ = 1.0f;
    rating_ = 0.0;
    certainty_ = FLT_MAX;
    min_x_height_ = 0.0f;
    max_x_height_ = FLT_MAX;
    permuter_ = NO_PERM;
    unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
    dangerous_ambig_found_ = false;
  }

  /// Helper function to build a WERD_CHOICE from the given string,
  /// fragment lengths, rating, certainty and permuter.
  /// The function assumes that src_string is not nullptr.
  /// src_lengths argument could be nullptr, in which case the unichars
  /// in src_string are assumed to all be of length 1.
  void init(const char *src_string, const char *src_lengths, float src_rating, float src_certainty,
            uint8_t src_permuter);

  /// Set the fields in this choice to be default (bad) values.
  inline void make_bad() {
    length_ = 0;
    rating_ = kBadRating;
    certainty_ = -FLT_MAX;
  }

  /// This function assumes that there is enough space reserved
  /// in the WERD_CHOICE for adding another unichar.
  /// This is an efficient alternative to append_unichar_id().
  inline void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating,
                                                float certainty) {
    assert(reserved_ > length_);
    length_++;
    this->set_unichar_id(unichar_id, blob_count, rating, certainty, length_ - 1);
  }

  void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty);

  inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty,
                             unsigned index) {
    assert(index < length_);
    unichar_ids_[index] = unichar_id;
    state_[index] = blob_count;
    certainties_[index] = certainty;
    script_pos_[index] = SP_NORMAL;
    rating_ += rating;
    if (certainty < certainty_) {
      certainty_ = certainty;
    }
  }
  // Sets the entries for the given index from the BLOB_CHOICE, assuming
  // unit fragment lengths, but setting the state for this index to blob_count.
  void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice);

  bool contains_unichar_id(UNICHAR_ID unichar_id) const;
  void remove_unichar_ids(unsigned index, int num);
  inline void remove_last_unichar_id() {
    --length_;
  }
  inline void remove_unichar_id(unsigned index) {
    this->remove_unichar_ids(index, 1);
  }
  bool has_rtl_unichar_id() const;
  void reverse_and_mirror_unichar_ids();

  // Returns the half-open interval of unichar_id indices [start, end) which
  // enclose the core portion of this word -- the part after stripping
  // punctuation from the left and right.
  void punct_stripped(unsigned *start_core, unsigned *end_core) const;

  // Returns the indices [start, end) containing the core of the word, stripped
  // of any superscript digits on either side. (i.e., the non-footnote part
  // of the word). There is no guarantee that the output range is non-empty.
  void GetNonSuperscriptSpan(int *start, int *end) const;

  // Return a copy of this WERD_CHOICE with the choices [start, end).
  // The result is useful only for checking against a dictionary.
  WERD_CHOICE shallow_copy(unsigned start, unsigned end) const;

  void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const;
  std::string debug_string() const {
    std::string word_str;
    for (unsigned i = 0; i < length_; ++i) {
      word_str += unicharset_->debug_str(unichar_ids_[i]);
      word_str += " ";
    }
    return word_str;
  }
  // Returns true if any unichar_id in the word is a non-space-delimited char.
  bool ContainsAnyNonSpaceDelimited() const {
    for (unsigned i = 0; i < length_; ++i) {
      if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) {
        return true;
      }
    }
    return false;
  }
  // Returns true if the word is all spaces.
  bool IsAllSpaces() const {
    for (unsigned i = 0; i < length_; ++i) {
      if (unichar_ids_[i] != UNICHAR_SPACE) {
        return false;
      }
    }
    return true;
  }

  // Call this to override the default (strict left to right graphemes)
  // with the fact that some engine produces a "reading order" set of
  // Graphemes for each word.
  bool set_unichars_in_script_order(bool in_script_order) {
    return unichars_in_script_order_ = in_script_order;
  }

  bool unichars_in_script_order() const {
    return unichars_in_script_order_;
  }

  // Returns a UTF-8 string equivalent to the current choice
  // of UNICHAR IDs.
  std::string &unichar_string() {
    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
    return unichar_string_;
  }

  // Returns a UTF-8 string equivalent to the current choice
  // of UNICHAR IDs.
  const std::string &unichar_string() const {
    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
    return unichar_string_;
  }

  // Returns the lengths, one byte each, representing the number of bytes
  // required in the unichar_string for each UNICHAR_ID.
  const std::string &unichar_lengths() const {
    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
    return unichar_lengths_;
  }

  // Sets up the script_pos_ member using the blobs_list to get the bln
  // bounding boxes, *this to get the unichars, and this->unicharset
  // to get the target positions. If small_caps is true, sub/super are not
  // considered, but dropcaps are.
  // NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.)
  void SetScriptPositions(bool small_caps, TWERD *word, int debug = 0);
  // Sets all the script_pos_ positions to the given position.
  void SetAllScriptPositions(ScriptPos position);

  static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset,
                                    const TBOX &blob_box, UNICHAR_ID unichar_id);

  // Returns the "dominant" script ID for the word.  By "dominant", the script
  // must account for at least half the characters.  Otherwise, it returns 0.
  // Note that for Japanese, Hiragana and Katakana are simply treated as Han.
  int GetTopScriptID() const;

  // Fixes the state_ for a chop at the given blob_posiiton.
  void UpdateStateForSplit(int blob_position);

  // Returns the sum of all the state elements, being the total number of blobs.
  unsigned TotalOfStates() const;

  void print() const {
    this->print("");
  }
  void print(const char *msg) const;
  // Prints the segmentation state with an introductory message.
  void print_state(const char *msg) const;

  // Displays the segmentation state of *this (if not the same as the last
  // one displayed) and waits for a click in the window.
  void DisplaySegmentation(TWERD *word);

  WERD_CHOICE &operator+=(        // concatanate
      const WERD_CHOICE &second); // second on first

  WERD_CHOICE &operator=(const WERD_CHOICE &source);

private:
  const UNICHARSET *unicharset_;
  // TODO(rays) Perhaps replace the multiple arrays with an array of structs?
  // unichar_ids_ is an array of classifier "results" that make up a word.
  // For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position
  // of each unichar_id.
  // state_[i] indicates the number of blobs in WERD_RES::chopped_word that
  // were put together to make the classification results in the ith position
  // in unichar_ids_, and certainties_[i] is the certainty of the choice that
  // was used in this word.
  // == Change from before ==
  // Previously there was fragment_lengths_ that allowed a word to be
  // artificially composed of multiple fragment results. Since the new
  // segmentation search doesn't do fragments, treatment of fragments has
  // been moved to a lower level, augmenting the ratings matrix with the
  // combined fragments, and allowing the language-model/segmentation-search
  // to deal with only the combined unichar_ids.
  std::vector<UNICHAR_ID> unichar_ids_; // unichar ids that represent the text of the word
  std::vector<ScriptPos> script_pos_;   // Normal/Sub/Superscript of each unichar.
  std::vector<int> state_;              // Number of blobs in each unichar.
  std::vector<float> certainties_;      // Certainty of each unichar.
  unsigned reserved_;            // size of the above arrays
  unsigned length_;              // word length
  // Factor that was used to adjust the rating.
  float adjust_factor_;
  // Rating is the sum of the ratings of the individual blobs in the word.
  float rating_; // size related
  // certainty is the min (worst) certainty of the individual blobs in the word.
  float certainty_; // absolute
  // xheight computed from the result, or 0 if inconsistent.
  float min_x_height_;
  float max_x_height_;
  uint8_t permuter_; // permuter code

  // Normally, the ratings_ matrix represents the recognition results in order
  // from left-to-right.  However, some engines (say Cube) may return
  // recognition results in the order of the script's major reading direction
  // (for Arabic, that is right-to-left).
  bool unichars_in_script_order_;
  // True if NoDangerousAmbig found an ambiguity.
  bool dangerous_ambig_found_;

  // The following variables are populated and passed by reference any
  // time unichar_string() or unichar_lengths() are called.
  mutable std::string unichar_string_;
  mutable std::string unichar_lengths_;
};

// Make WERD_CHOICE listable.
ELISTIZEH(WERD_CHOICE)
using BLOB_CHOICE_LIST_VECTOR = std::vector<BLOB_CHOICE_LIST *>;

// Utilities for comparing WERD_CHOICEs

bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2);

// Utilities for debug printing.
void print_ratings_list(const char *msg,                     // intro message
                        BLOB_CHOICE_LIST *ratings,           // list of results
                        const UNICHARSET &current_unicharset // unicharset that can be used
                                                             // for id-to-unichar conversion
);

} // namespace tesseract

#endif
author	Franz Glasner <fzglas.hg@dom66.de>
date	Fri, 19 Sep 2025 10:28:53 +0200
parents	b50eed0cc0ef
children