Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccstruct/ratngs.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/ratngs.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,825 @@ +/********************************************************************** + * File: ratngs.cpp (Formerly ratings.c) + * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes. + * Author: Ray Smith + * + * (C) Copyright 1992, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "ratngs.h" + +#include "blobs.h" +#include "matrix.h" +#include "normalis.h" // kBlnBaselineOffset. +#include "unicharset.h" + +#include <algorithm> +#include <cmath> +#include <string> +#include <vector> + +namespace tesseract { + +const float WERD_CHOICE::kBadRating = 100000.0; +// Min offset in baseline-normalized coords to make a character a subscript. +const int kMinSubscriptOffset = 20; +// Min offset in baseline-normalized coords to make a character a superscript. +const int kMinSuperscriptOffset = 20; +// Max y of bottom of a drop-cap blob. +const int kMaxDropCapBottom = -128; +// Max fraction of x-height to use as denominator in measuring x-height overlap. +const double kMaxOverlapDenominator = 0.125; +// Min fraction of x-height range that should be in agreement for matching +// x-heights. +const double kMinXHeightMatch = 0.5; +// Max tolerance on baseline position as a fraction of x-height for matching +// baselines. +const double kMaxBaselineDrift = 0.0625; + +static const char kPermuterTypeNoPerm[] = "None"; +static const char kPermuterTypePuncPerm[] = "Punctuation"; +static const char kPermuterTypeTopPerm[] = "Top Choice"; +static const char kPermuterTypeLowerPerm[] = "Top Lower Case"; +static const char kPermuterTypeUpperPerm[] = "Top Upper Case"; +static const char kPermuterTypeNgramPerm[] = "Ngram"; +static const char kPermuterTypeNumberPerm[] = "Number"; +static const char kPermuterTypeUserPatPerm[] = "User Pattern"; +static const char kPermuterTypeSysDawgPerm[] = "System Dictionary"; +static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary"; +static const char kPermuterTypeUserDawgPerm[] = "User Dictionary"; +static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary"; +static const char kPermuterTypeCompoundPerm[] = "Compound"; + +static const char *const kPermuterTypeNames[] = { + kPermuterTypeNoPerm, // 0 + kPermuterTypePuncPerm, // 1 + kPermuterTypeTopPerm, // 2 + kPermuterTypeLowerPerm, // 3 + kPermuterTypeUpperPerm, // 4 + kPermuterTypeNgramPerm, // 5 + kPermuterTypeNumberPerm, // 6 + kPermuterTypeUserPatPerm, // 7 + kPermuterTypeSysDawgPerm, // 8 + kPermuterTypeDocDawgPerm, // 9 + kPermuterTypeUserDawgPerm, // 10 + kPermuterTypeFreqDawgPerm, // 11 + kPermuterTypeCompoundPerm // 12 +}; + +/** + * BLOB_CHOICE::BLOB_CHOICE + * + * Constructor to build a BLOB_CHOICE from a char, rating and certainty. + */ +BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id + float src_rating, // rating + float src_cert, // certainty + int src_script_id, // script + float min_xheight, // min xheight allowed + float max_xheight, // max xheight by this char + float yshift, // yshift out of position + BlobChoiceClassifier c) { // adapted match or other + unichar_id_ = src_unichar_id; + rating_ = src_rating; + certainty_ = src_cert; + fontinfo_id_ = -1; + fontinfo_id2_ = -1; + script_id_ = src_script_id; + min_xheight_ = min_xheight; + max_xheight_ = max_xheight; + yshift_ = yshift; + classifier_ = c; +} + +/** + * BLOB_CHOICE::BLOB_CHOICE + * + * Constructor to build a BLOB_CHOICE from another BLOB_CHOICE. + */ +BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) : ELIST_LINK(other) { + unichar_id_ = other.unichar_id(); + rating_ = other.rating(); + certainty_ = other.certainty(); + fontinfo_id_ = other.fontinfo_id(); + fontinfo_id2_ = other.fontinfo_id2(); + script_id_ = other.script_id(); + matrix_cell_ = other.matrix_cell_; + min_xheight_ = other.min_xheight_; + max_xheight_ = other.max_xheight_; + yshift_ = other.yshift(); + classifier_ = other.classifier_; +#ifndef DISABLED_LEGACY_ENGINE + fonts_ = other.fonts_; +#endif // ndef DISABLED_LEGACY_ENGINE +} + +// Copy assignment operator. +BLOB_CHOICE &BLOB_CHOICE::operator=(const BLOB_CHOICE &other) { + ELIST_LINK::operator=(other); + unichar_id_ = other.unichar_id(); + rating_ = other.rating(); + certainty_ = other.certainty(); + fontinfo_id_ = other.fontinfo_id(); + fontinfo_id2_ = other.fontinfo_id2(); + script_id_ = other.script_id(); + matrix_cell_ = other.matrix_cell_; + min_xheight_ = other.min_xheight_; + max_xheight_ = other.max_xheight_; + yshift_ = other.yshift(); + classifier_ = other.classifier_; +#ifndef DISABLED_LEGACY_ENGINE + fonts_ = other.fonts_; +#endif // ndef DISABLED_LEGACY_ENGINE + return *this; +} + +// Returns true if *this and other agree on the baseline and x-height +// to within some tolerance based on a given estimate of the x-height. +bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const { + double baseline_diff = std::fabs(yshift() - other.yshift()); + if (baseline_diff > kMaxBaselineDrift * x_height) { + if (debug) { + tprintf("Baseline diff %g for %d v %d\n", baseline_diff, unichar_id_, other.unichar_id_); + } + return false; + } + double this_range = max_xheight() - min_xheight(); + double other_range = other.max_xheight() - other.min_xheight(); + double denominator = + ClipToRange(std::min(this_range, other_range), 1.0, kMaxOverlapDenominator * x_height); + double overlap = + std::min(max_xheight(), other.max_xheight()) - std::max(min_xheight(), other.min_xheight()); + overlap /= denominator; + if (debug) { + tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n", unichar_id_, + other.unichar_id_, baseline_diff, this_range, other_range, denominator, overlap); + } + + return overlap >= kMinXHeightMatch; +} + +// Helper to find the BLOB_CHOICE in the bc_list that matches the given +// unichar_id, or nullptr if there is no match. +BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list) { + // Find the corresponding best BLOB_CHOICE. + BLOB_CHOICE_IT choice_it(bc_list); + for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { + BLOB_CHOICE *choice = choice_it.data(); + if (choice->unichar_id() == char_id) { + return choice; + } + } + return nullptr; +} + +const char *WERD_CHOICE::permuter_name(uint8_t permuter) { + return kPermuterTypeNames[permuter]; +} + +const char *ScriptPosToString(enum ScriptPos script_pos) { + switch (script_pos) { + case SP_NORMAL: + return "NORM"; + case SP_SUBSCRIPT: + return "SUB"; + case SP_SUPERSCRIPT: + return "SUPER"; + case SP_DROPCAP: + return "DROPC"; + } + return "SP_UNKNOWN"; +} + +/** + * WERD_CHOICE::WERD_CHOICE + * + * Constructor to build a WERD_CHOICE from the given string. + * The function assumes that src_string is not nullptr. + */ +WERD_CHOICE::WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset) + : unicharset_(&unicharset) { + std::vector<UNICHAR_ID> encoding; + std::vector<char> lengths; + std::string cleaned = unicharset.CleanupString(src_string); + if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths, nullptr)) { + lengths.push_back('\0'); + std::string src_lengths = &lengths[0]; + this->init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0, NO_PERM); + } else { // There must have been an invalid unichar in the string. + this->init(8); + this->make_bad(); + } +} + +/** + * WERD_CHOICE::init + * + * Helper function to build a WERD_CHOICE from the given string, + * fragment lengths, rating, certainty and permuter. + * + * The function assumes that src_string is not nullptr. + * src_lengths argument could be nullptr, in which case the unichars + * in src_string are assumed to all be of length 1. + */ +void WERD_CHOICE::init(const char *src_string, const char *src_lengths, float src_rating, + float src_certainty, uint8_t src_permuter) { + int src_string_len = strlen(src_string); + if (src_string_len == 0) { + this->init(8); + } else { + this->init(src_lengths ? strlen(src_lengths) : src_string_len); + length_ = reserved_; + int offset = 0; + for (unsigned i = 0; i < length_; ++i) { + int unichar_length = src_lengths ? src_lengths[i] : 1; + unichar_ids_[i] = unicharset_->unichar_to_id(src_string + offset, unichar_length); + state_[i] = 1; + certainties_[i] = src_certainty; + offset += unichar_length; + } + } + adjust_factor_ = 1.0f; + rating_ = src_rating; + certainty_ = src_certainty; + permuter_ = src_permuter; + dangerous_ambig_found_ = false; +} + +/** + * WERD_CHOICE::~WERD_CHOICE + */ +WERD_CHOICE::~WERD_CHOICE() = default; + +const char *WERD_CHOICE::permuter_name() const { + return kPermuterTypeNames[permuter_]; +} + +// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word, +// taken from the appropriate cell in the ratings MATRIX. +// Borrowed pointer, so do not delete. +BLOB_CHOICE_LIST *WERD_CHOICE::blob_choices(unsigned index, MATRIX *ratings) const { + MATRIX_COORD coord = MatrixCoord(index); + BLOB_CHOICE_LIST *result = ratings->get(coord.col, coord.row); + if (result == nullptr) { + result = new BLOB_CHOICE_LIST; + ratings->put(coord.col, coord.row, result); + } + return result; +} + +// Returns the MATRIX_COORD corresponding to the location in the ratings +// MATRIX for the given index into the word. +MATRIX_COORD WERD_CHOICE::MatrixCoord(unsigned index) const { + int col = 0; + for (unsigned i = 0; i < index; ++i) { + col += state_[i]; + } + int row = col + state_[index] - 1; + return MATRIX_COORD(col, row); +} + +// Sets the entries for the given index from the BLOB_CHOICE, assuming +// unit fragment lengths, but setting the state for this index to blob_count. +void WERD_CHOICE::set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice) { + unichar_ids_[index] = blob_choice->unichar_id(); + script_pos_[index] = tesseract::SP_NORMAL; + state_[index] = blob_count; + certainties_[index] = blob_choice->certainty(); +} + +/** + * contains_unichar_id + * + * Returns true if unichar_ids_ contain the given unichar_id, false otherwise. + */ +bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const { + for (unsigned i = 0; i < length_; ++i) { + if (unichar_ids_[i] == unichar_id) { + return true; + } + } + return false; +} + +/** + * remove_unichar_ids + * + * Removes num unichar ids starting from index start from unichar_ids_ + * and updates length_ and fragment_lengths_ to reflect this change. + * Note: this function does not modify rating_ and certainty_. + */ +void WERD_CHOICE::remove_unichar_ids(unsigned start, int num) { + ASSERT_HOST(start + num <= length_); + // Accumulate the states to account for the merged blobs. + for (int i = 0; i < num; ++i) { + if (start > 0) { + state_[start - 1] += state_[start + i]; + } else if (start + num < length_) { + state_[start + num] += state_[start + i]; + } + } + for (unsigned i = start; i + num < length_; ++i) { + unichar_ids_[i] = unichar_ids_[i + num]; + script_pos_[i] = script_pos_[i + num]; + state_[i] = state_[i + num]; + certainties_[i] = certainties_[i + num]; + } + length_ -= num; +} + +/** + * reverse_and_mirror_unichar_ids + * + * Reverses and mirrors unichars in unichar_ids. + */ +void WERD_CHOICE::reverse_and_mirror_unichar_ids() { + for (unsigned i = 0; i < length_ / 2; ++i) { + UNICHAR_ID tmp_id = unichar_ids_[i]; + unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_ - 1 - i]); + unichar_ids_[length_ - 1 - i] = unicharset_->get_mirror(tmp_id); + } + if (length_ % 2 != 0) { + unichar_ids_[length_ / 2] = unicharset_->get_mirror(unichar_ids_[length_ / 2]); + } +} + +/** + * punct_stripped + * + * Returns the half-open interval of unichar_id indices [start, end) which + * enclose the core portion of this word -- the part after stripping + * punctuation from the left and right. + */ +void WERD_CHOICE::punct_stripped(unsigned *start, unsigned *end) const { + *start = 0; + *end = length(); + while (*start < length() && unicharset()->get_ispunctuation(unichar_id(*start))) { + (*start)++; + } + while (*end > 0 && unicharset()->get_ispunctuation(unichar_id(*end - 1))) { + (*end)--; + } +} + +void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const { + int end = length(); + while (end > 0 && unicharset_->get_isdigit(unichar_ids_[end - 1]) && + BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) { + end--; + } + int start = 0; + while (start < end && unicharset_->get_isdigit(unichar_ids_[start]) && + BlobPosition(start) == tesseract::SP_SUPERSCRIPT) { + start++; + } + *pstart = start; + *pend = end; +} + +WERD_CHOICE WERD_CHOICE::shallow_copy(unsigned start, unsigned end) const { + ASSERT_HOST(start <= length_); + ASSERT_HOST(end <= length_); + if (end < start) { + end = start; + } + WERD_CHOICE retval(unicharset_, end - start); + for (auto i = start; i < end; i++) { + retval.append_unichar_id_space_allocated(unichar_ids_[i], state_[i], 0.0f, certainties_[i]); + } + return retval; +} + +/** + * has_rtl_unichar_id + * + * Returns true if unichar_ids contain at least one "strongly" RTL unichar. + */ +bool WERD_CHOICE::has_rtl_unichar_id() const { + for (unsigned i = 0; i < length_; ++i) { + UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]); + if (dir == UNICHARSET::U_RIGHT_TO_LEFT || dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) { + return true; + } + } + return false; +} + +/** + * string_and_lengths + * + * Populates the given word_str with unichars from unichar_ids and + * and word_lengths_str with the corresponding unichar lengths. + */ +void WERD_CHOICE::string_and_lengths(std::string *word_str, std::string *word_lengths_str) const { + *word_str = ""; + if (word_lengths_str != nullptr) { + *word_lengths_str = ""; + } + for (unsigned i = 0; i < length_; ++i) { + const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]); + *word_str += ch; + if (word_lengths_str != nullptr) { + *word_lengths_str += (char)strlen(ch); + } + } +} + +/** + * append_unichar_id + * + * Make sure there is enough space in the word for the new unichar id + * and call append_unichar_id_space_allocated(). + */ +void WERD_CHOICE::append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, + float certainty) { + if (length_ == reserved_) { + this->double_the_size(); + } + this->append_unichar_id_space_allocated(unichar_id, blob_count, rating, certainty); +} + +/** + * WERD_CHOICE::operator+= + * + * Cat a second word rating on the end of this current one. + * The ratings are added and the confidence is the min. + * If the permuters are NOT the same the permuter is set to COMPOUND_PERM + */ +WERD_CHOICE &WERD_CHOICE::operator+=(const WERD_CHOICE &second) { + ASSERT_HOST(unicharset_ == second.unicharset_); + while (reserved_ < length_ + second.length()) { + this->double_the_size(); + } + const std::vector<UNICHAR_ID> &other_unichar_ids = second.unichar_ids(); + for (unsigned i = 0; i < second.length(); ++i) { + unichar_ids_[length_ + i] = other_unichar_ids[i]; + state_[length_ + i] = second.state_[i]; + certainties_[length_ + i] = second.certainties_[i]; + script_pos_[length_ + i] = second.BlobPosition(i); + } + length_ += second.length(); + if (second.adjust_factor_ > adjust_factor_) { + adjust_factor_ = second.adjust_factor_; + } + rating_ += second.rating(); // add ratings + if (second.certainty() < certainty_) { // take min + certainty_ = second.certainty(); + } + if (second.dangerous_ambig_found_) { + dangerous_ambig_found_ = true; + } + if (permuter_ == NO_PERM) { + permuter_ = second.permuter(); + } else if (second.permuter() != NO_PERM && second.permuter() != permuter_) { + permuter_ = COMPOUND_PERM; + } + return *this; +} + +/** + * WERD_CHOICE::operator= + * + * Allocate enough memory to hold a copy of source and copy over + * all the information from source to this WERD_CHOICE. + */ +WERD_CHOICE &WERD_CHOICE::operator=(const WERD_CHOICE &source) { + while (reserved_ < source.length()) { + this->double_the_size(); + } + + unicharset_ = source.unicharset_; + const std::vector<UNICHAR_ID> &other_unichar_ids = source.unichar_ids(); + for (unsigned i = 0; i < source.length(); ++i) { + unichar_ids_[i] = other_unichar_ids[i]; + state_[i] = source.state_[i]; + certainties_[i] = source.certainties_[i]; + script_pos_[i] = source.BlobPosition(i); + } + length_ = source.length(); + adjust_factor_ = source.adjust_factor_; + rating_ = source.rating(); + certainty_ = source.certainty(); + min_x_height_ = source.min_x_height(); + max_x_height_ = source.max_x_height(); + permuter_ = source.permuter(); + dangerous_ambig_found_ = source.dangerous_ambig_found_; + return *this; +} + +// Sets up the script_pos_ member using the blobs_list to get the bln +// bounding boxes, *this to get the unichars, and this->unicharset +// to get the target positions. If small_caps is true, sub/super are not +// considered, but dropcaps are. +// NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.) +void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) { + // Initialize to normal. + for (unsigned i = 0; i < length_; ++i) { + script_pos_[i] = tesseract::SP_NORMAL; + } + if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) { + return; + } + + unsigned position_counts[4] = {0, 0, 0, 0}; + + int chunk_index = 0; + for (unsigned blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) { + TBLOB *tblob = word->blobs[chunk_index]; + int uni_id = unichar_id(blob_index); + TBOX blob_box = tblob->bounding_box(); + if (!state_.empty()) { + for (int i = 1; i < state_[blob_index]; ++i) { + ++chunk_index; + tblob = word->blobs[chunk_index]; + blob_box += tblob->bounding_box(); + } + } + script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, uni_id); + if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) { + script_pos_[blob_index] = tesseract::SP_NORMAL; + } + position_counts[script_pos_[blob_index]]++; + } + // If almost everything looks like a superscript or subscript, + // we most likely just got the baseline wrong. + if (4 * position_counts[tesseract::SP_SUBSCRIPT] > 3 * length_ || + 4 * position_counts[tesseract::SP_SUPERSCRIPT] > 3 * length_) { + if (debug >= 2) { + tprintf( + "Most characters of %s are subscript or superscript.\n" + "That seems wrong, so I'll assume we got the baseline wrong\n", + unichar_string().c_str()); + } + for (unsigned i = 0; i < length_; i++) { + ScriptPos sp = script_pos_[i]; + if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) { + ASSERT_HOST(position_counts[sp] > 0); + position_counts[sp]--; + position_counts[tesseract::SP_NORMAL]++; + script_pos_[i] = tesseract::SP_NORMAL; + } + } + } + + if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || debug >= 2) { + tprintf("SetScriptPosition on %s\n", unichar_string().c_str()); + int chunk_index = 0; + for (unsigned blob_index = 0; blob_index < length_; ++blob_index) { + if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) { + TBLOB *tblob = word->blobs[chunk_index]; + ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index)); + } + chunk_index += state_.empty() ? 1 : state_[blob_index]; + } + } +} + +// Sets all the script_pos_ positions to the given position. +void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) { + for (unsigned i = 0; i < length_; ++i) { + script_pos_[i] = position; + } +} + +/* static */ +ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, + const TBOX &blob_box, UNICHAR_ID unichar_id) { + ScriptPos retval = tesseract::SP_NORMAL; + int top = blob_box.top(); + int bottom = blob_box.bottom(); + int min_bottom, max_bottom, min_top, max_top; + unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top); + + int sub_thresh_top = min_top - kMinSubscriptOffset; + int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset; + int sup_thresh_bot = max_bottom + kMinSuperscriptOffset; + if (bottom <= kMaxDropCapBottom) { + retval = tesseract::SP_DROPCAP; + } else if (top < sub_thresh_top && bottom < sub_thresh_bot) { + retval = tesseract::SP_SUBSCRIPT; + } else if (bottom > sup_thresh_bot) { + retval = tesseract::SP_SUPERSCRIPT; + } + + if (print_debug) { + const char *pos = ScriptPosToString(retval); + tprintf( + "%s Character %s[bot:%d top: %d] " + "bot_range[%d,%d] top_range[%d, %d] " + "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n", + pos, unicharset.id_to_unichar(unichar_id), bottom, top, min_bottom, max_bottom, min_top, + max_top, sub_thresh_bot, sub_thresh_top, sup_thresh_bot); + } + return retval; +} + +// Returns the script-id (eg Han) of the dominant script in the word. +int WERD_CHOICE::GetTopScriptID() const { + unsigned max_script = unicharset_->get_script_table_size(); + std::vector<unsigned> sid(max_script); + for (unsigned x = 0; x < length_; ++x) { + int script_id = unicharset_->get_script(unichar_id(x)); + sid[script_id]++; + } + if (unicharset_->han_sid() != unicharset_->null_sid()) { + // Add the Hiragana & Katakana counts to Han and zero them out. + if (unicharset_->hiragana_sid() != unicharset_->null_sid()) { + sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()]; + sid[unicharset_->hiragana_sid()] = 0; + } + if (unicharset_->katakana_sid() != unicharset_->null_sid()) { + sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()]; + sid[unicharset_->katakana_sid()] = 0; + } + } + // Note that high script ID overrides lower one on a tie, thus biasing + // towards non-Common script (if sorted that way in unicharset file). + unsigned max_sid = 0; + for (unsigned x = 1; x < max_script; x++) { + if (sid[x] >= sid[max_sid]) { + max_sid = x; + } + } + if (sid[max_sid] < length_ / 2) { + max_sid = unicharset_->null_sid(); + } + return max_sid; +} + +// Fixes the state_ for a chop at the given blob_posiiton. +void WERD_CHOICE::UpdateStateForSplit(int blob_position) { + int total_chunks = 0; + for (unsigned i = 0; i < length_; ++i) { + total_chunks += state_[i]; + if (total_chunks > blob_position) { + ++state_[i]; + return; + } + } +} + +// Returns the sum of all the state elements, being the total number of blobs. +unsigned WERD_CHOICE::TotalOfStates() const { + unsigned total_chunks = 0; + for (unsigned i = 0; i < length_; ++i) { + total_chunks += state_[i]; + } + return total_chunks; +} + +/** + * WERD_CHOICE::print + * + * Print WERD_CHOICE to stdout. + */ +void WERD_CHOICE::print(const char *msg) const { + tprintf("%s : ", msg); + for (unsigned i = 0; i < length_; ++i) { + tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i])); + } + tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", rating_, certainty_, + adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_); + tprintf("pos"); + for (unsigned i = 0; i < length_; ++i) { + tprintf("\t%s", ScriptPosToString(script_pos_[i])); + } + tprintf("\nstr"); + for (unsigned i = 0; i < length_; ++i) { + tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i])); + } + tprintf("\nstate:"); + for (unsigned i = 0; i < length_; ++i) { + tprintf("\t%d ", state_[i]); + } + tprintf("\nC"); + for (unsigned i = 0; i < length_; ++i) { + tprintf("\t%.3f", certainties_[i]); + } + tprintf("\n"); +} + +// Prints the segmentation state with an introductory message. +void WERD_CHOICE::print_state(const char *msg) const { + tprintf("%s", msg); + for (unsigned i = 0; i < length_; ++i) { + tprintf(" %d", state_[i]); + } + tprintf("\n"); +} + +#ifndef GRAPHICS_DISABLED + +// Displays the segmentation state of *this (if not the same as the last +// one displayed) and waits for a click in the window. +void WERD_CHOICE::DisplaySegmentation(TWERD *word) { + // Number of different colors to draw with. + const int kNumColors = 6; + static ScrollView *segm_window = nullptr; + // Check the state against the static prev_drawn_state. + static std::vector<int> prev_drawn_state; + bool already_done = prev_drawn_state.size() == length_; + if (!already_done) { + prev_drawn_state.clear(); + prev_drawn_state.resize(length_); + } + for (unsigned i = 0; i < length_; ++i) { + if (prev_drawn_state[i] != state_[i]) { + already_done = false; + } + prev_drawn_state[i] = state_[i]; + } + if (already_done || word->blobs.empty()) { + return; + } + + // Create the window if needed. + if (segm_window == nullptr) { + segm_window = new ScrollView("Segmentation", 5, 10, 500, 256, 2000.0, 256.0, true); + } else { + segm_window->Clear(); + } + + TBOX bbox; + int blob_index = 0; + for (unsigned c = 0; c < length_; ++c) { + auto color = static_cast<ScrollView::Color>(c % kNumColors + 3); + for (int i = 0; i < state_[c]; ++i, ++blob_index) { + TBLOB *blob = word->blobs[blob_index]; + bbox += blob->bounding_box(); + blob->plot(segm_window, color, color); + } + } + segm_window->ZoomToRectangle(bbox.left(), bbox.top(), bbox.right(), bbox.bottom()); + segm_window->Update(); + segm_window->Wait(); +} + +#endif // !GRAPHICS_DISABLED + +bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2) { + const UNICHARSET *uchset = word1.unicharset(); + if (word2.unicharset() != uchset) { + return false; + } + unsigned w1start, w1end; + word1.punct_stripped(&w1start, &w1end); + unsigned w2start, w2end; + word2.punct_stripped(&w2start, &w2end); + if (w1end - w1start != w2end - w2start) { + return false; + } + for (unsigned i = 0; i < w1end - w1start; i++) { + if (uchset->to_lower(word1.unichar_id(w1start + i)) != + uchset->to_lower(word2.unichar_id(w2start + i))) { + return false; + } + } + return true; +} + +/** + * print_ratings_list + * + * Send all the ratings out to the logfile. + * + * @param msg intro message + * @param ratings list of ratings + * @param current_unicharset unicharset that can be used + * for id-to-unichar conversion + */ +void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, + const UNICHARSET ¤t_unicharset) { + if (ratings->empty()) { + tprintf("%s:<none>\n", msg); + return; + } + if (*msg != '\0') { + tprintf("%s\n", msg); + } + BLOB_CHOICE_IT c_it; + c_it.set_to_list(ratings); + for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { + c_it.data()->print(¤t_unicharset); + if (!c_it.at_last()) { + tprintf("\n"); + } + } + tprintf("\n"); + fflush(stdout); +} + +} // namespace tesseract
