Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/ccutil/unicharset.h comparison

comparison mupdf-source/thirdparty/tesseract/src/ccutil/unicharset.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+///////////////////////////////////////////////////////////////////////
+// File:        unicharset.h
+// Description: Unicode character/ligature set class.
+// Author:      Thomas Kielbus
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+#ifndef TESSERACT_CCUTIL_UNICHARSET_H_
+#define TESSERACT_CCUTIL_UNICHARSET_H_
+#include "errcode.h"
+#include "unicharmap.h"
+#include <tesseract/unichar.h>
+#include "helpers.h"
+#include "serialis.h"
+#include <functional> // for std::function
+namespace tesseract {
+// Enum holding special values of unichar_id. Every unicharset has these.
+// Warning! Keep in sync with kSpecialUnicharCodes.
+enum SpecialUnicharCodes {
+UNICHAR_SPACE,
+UNICHAR_JOINED,
+UNICHAR_BROKEN,
+SPECIAL_UNICHAR_CODES_COUNT
+};
+// Boolean flag for unichar_insert. It's a bit of a double negative to allow
+// the default value to be false.
+enum class OldUncleanUnichars {
+kFalse,
+kTrue,
+};
+class TESS_API CHAR_FRAGMENT {
+public:
+// Minimum number of characters used for fragment representation.
+static const int kMinLen = 6;
+// Maximum number of characters used for fragment representation.
+static const int kMaxLen = 3 + UNICHAR_LEN + 2;
+// Maximum number of fragments per character.
+static const int kMaxChunks = 5;
+// Setters and Getters.
+inline void set_all(const char *unichar, int pos, int total, bool natural) {
+set_unichar(unichar);
+set_pos(pos);
+set_total(total);
+set_natural(natural);
+}
+inline void set_unichar(const char *uch) {
+strncpy(this->unichar, uch, sizeof(this->unichar));
+this->unichar[UNICHAR_LEN] = '\0';
+}
+inline void set_pos(int p) {
+this->pos = p;
+}
+inline void set_total(int t) {
+this->total = t;
+}
+inline const char *get_unichar() const {
+return this->unichar;
+}
+inline int get_pos() const {
+return this->pos;
+}
+inline int get_total() const {
+return this->total;
+}
+// Returns the string that represents a fragment
+// with the given unichar, pos and total.
+static std::string to_string(const char *unichar, int pos, int total,
+bool natural);
+// Returns the string that represents this fragment.
+std::string to_string() const {
+return to_string(unichar, pos, total, natural);
+}
+// Checks whether a fragment has the same unichar,
+// position and total as the given inputs.
+inline bool equals(const char *other_unichar, int other_pos,
+int other_total) const {
+return (strcmp(this->unichar, other_unichar) == 0 &&
+this->pos == other_pos && this->total == other_total);
+}
+inline bool equals(const CHAR_FRAGMENT *other) const {
+return this->equals(other->get_unichar(), other->get_pos(),
+other->get_total());
+}
+// Checks whether a given fragment is a continuation of this fragment.
+// Assumes that the given fragment pointer is not nullptr.
+inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
+return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
+this->total == fragment->get_total() &&
+this->pos == fragment->get_pos() + 1);
+}
+// Returns true if this fragment is a beginning fragment.
+inline bool is_beginning() const {
+return this->pos == 0;
+}
+// Returns true if this fragment is an ending fragment.
+inline bool is_ending() const {
+return this->pos == this->total - 1;
+}
+// Returns true if the fragment was a separate component to begin with,
+// ie did not need chopping to be isolated, but may have been separated
+// out from a multi-outline blob.
+inline bool is_natural() const {
+return natural;
+}
+void set_natural(bool value) {
+natural = value;
+}
+// Parses the string to see whether it represents a character fragment
+// (rather than a regular character). If so, allocates memory for a new
+// CHAR_FRAGMENT instance and fills it in with the corresponding fragment
+// information. Fragments are of the form:
+// |m|1|2, meaning chunk 1 of 2 of character m, or
+// |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
+// to divide the parts, as they were already separate connected components.
+//
+// If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
+// instance, otherwise (if the string does not represent a fragment or it
+// looks like it does, but parsing it as a fragment fails) returns nullptr.
+//
+// Note: The caller is responsible for deallocating memory
+// associated with the returned pointer.
+static CHAR_FRAGMENT *parse_from_string(const char *str);
+private:
+char unichar[UNICHAR_LEN + 1];
+// True if the fragment was a separate component to begin with,
+// ie did not need chopping to be isolated, but may have been separated
+// out from a multi-outline blob.
+bool natural;
+int16_t pos;   // fragment position in the character
+int16_t total; // total number of fragments in the character
+};
+// The UNICHARSET class is an utility class for Tesseract that holds the
+// set of characters that are used by the engine. Each character is identified
+// by a unique number, from 0 to (size - 1).
+class TESS_API UNICHARSET {
+public:
+// Custom list of characters and their ligature forms (UTF8)
+// These map to unicode values in the private use area (PUC) and are supported
+// by only few font families (eg. Wyld, Adobe Caslon Pro).
+static const char *kCustomLigatures[][2];
+// List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
+static const char *kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];
+// ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)
+enum Direction {
+U_LEFT_TO_RIGHT = 0,
+U_RIGHT_TO_LEFT = 1,
+U_EUROPEAN_NUMBER = 2,
+U_EUROPEAN_NUMBER_SEPARATOR = 3,
+U_EUROPEAN_NUMBER_TERMINATOR = 4,
+U_ARABIC_NUMBER = 5,
+U_COMMON_NUMBER_SEPARATOR = 6,
+U_BLOCK_SEPARATOR = 7,
+U_SEGMENT_SEPARATOR = 8,
+U_WHITE_SPACE_NEUTRAL = 9,
+U_OTHER_NEUTRAL = 10,
+U_LEFT_TO_RIGHT_EMBEDDING = 11,
+U_LEFT_TO_RIGHT_OVERRIDE = 12,
+U_RIGHT_TO_LEFT_ARABIC = 13,
+U_RIGHT_TO_LEFT_EMBEDDING = 14,
+U_RIGHT_TO_LEFT_OVERRIDE = 15,
+U_POP_DIRECTIONAL_FORMAT = 16,
+U_DIR_NON_SPACING_MARK = 17,
+U_BOUNDARY_NEUTRAL = 18,
+U_FIRST_STRONG_ISOLATE = 19,
+U_LEFT_TO_RIGHT_ISOLATE = 20,
+U_RIGHT_TO_LEFT_ISOLATE = 21,
+U_POP_DIRECTIONAL_ISOLATE = 22,
+#ifndef U_HIDE_DEPRECATED_API
+U_CHAR_DIRECTION_COUNT
+#endif // U_HIDE_DEPRECATED_API
+};
+// Create an empty UNICHARSET
+UNICHARSET();
+~UNICHARSET();
+// Return the UNICHAR_ID of a given unichar representation within the
+// UNICHARSET.
+UNICHAR_ID unichar_to_id(const char *const unichar_repr) const;
+// Return the UNICHAR_ID of a given unichar representation within the
+// UNICHARSET. Only the first length characters from unichar_repr are used.
+UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const;
+// Return the minimum number of bytes that matches a legal UNICHAR_ID,
+// while leaving the rest of the string encodable. Returns 0 if the
+// beginning of the string is not encodable.
+// WARNING: this function now encodes the whole string for precision.
+// Use encode_string in preference to repeatedly calling step.
+int step(const char *str) const;
+// Returns true if the given UTF-8 string is encodable with this UNICHARSET.
+// If not encodable, write the first byte offset which cannot be converted
+// into the second (return) argument.
+bool encodable_string(const char *str, unsigned *first_bad_position) const;
+// Encodes the given UTF-8 string with this UNICHARSET.
+// Any part of the string that cannot be encoded (because the utf8 can't
+// be broken up into pieces that are in the unicharset) then:
+// if give_up_on_failure, stops and returns a partial encoding,
+// else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
+// Returns true if the encoding succeeds completely, false if there is at
+// least one failure.
+// If lengths is not nullptr, then it is filled with the corresponding
+// byte length of each encoded UNICHAR_ID.
+// If encoded_length is not nullptr then on return it contains the length of
+// str that was encoded. (if give_up_on_failure the location of the first
+// failure, otherwise strlen(str).)
+// WARNING: Caller must guarantee that str has already been cleaned of codes
+// that do not belong in the unicharset, or encoding may fail.
+// Use CleanupString to perform the cleaning.
+bool encode_string(const char *str, bool give_up_on_failure,
+std::vector<UNICHAR_ID> *encoding,
+std::vector<char> *lengths,
+unsigned *encoded_length) const;
+// Return the unichar representation corresponding to the given UNICHAR_ID
+// within the UNICHARSET.
+const char *id_to_unichar(UNICHAR_ID id) const;
+// Return the UTF8 representation corresponding to the given UNICHAR_ID after
+// resolving any private encodings internal to Tesseract. This method is
+// preferable to id_to_unichar for outputting text that will be visible to
+// external applications.
+const char *id_to_unichar_ext(UNICHAR_ID id) const;
+// Return a string that reformats the utf8 str into the str followed
+// by its hex unicodes.
+static std::string debug_utf8_str(const char *str);
+// Removes/replaces content that belongs in rendered text, but not in the
+// unicharset.
+static std::string CleanupString(const char *utf8_str) {
+return CleanupString(utf8_str, strlen(utf8_str));
+}
+static std::string CleanupString(const char *utf8_str, size_t length);
+// Return a string containing debug information on the unichar, including
+// the id_to_unichar, its hex unicodes and the properties.
+std::string debug_str(UNICHAR_ID id) const;
+std::string debug_str(const char *unichar_repr) const {
+return debug_str(unichar_to_id(unichar_repr));
+}
+// Adds a unichar representation to the set. If old_style is true, then
+// TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
+// characters are ignored/skipped as if they don't exist and n-grams that
+// can already be encoded are not added.
+void unichar_insert(const char *const unichar_repr,
+OldUncleanUnichars old_style);
+void unichar_insert(const char *const unichar_repr) {
+unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
+}
+// Adds a unichar representation to the set. Avoids setting old_style to true,
+// unless it is necessary to make the new unichar get added.
+void unichar_insert_backwards_compatible(const char *const unichar_repr) {
+std::string cleaned = CleanupString(unichar_repr);
+if (cleaned != unichar_repr) {
+unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
+} else {
+auto old_size = size();
+unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
+if (size() == old_size) {
+unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
+}
+}
+}
+// Return true if the given unichar id exists within the set.
+// Relies on the fact that unichar ids are contiguous in the unicharset.
+bool contains_unichar_id(UNICHAR_ID unichar_id) const {
+return static_cast<size_t>(unichar_id) < unichars.size();
+}
+// Return true if the given unichar representation exists within the set.
+bool contains_unichar(const char *const unichar_repr) const;
+bool contains_unichar(const char *const unichar_repr, int length) const;
+// Return true if the given unichar representation corresponds to the given
+// UNICHAR_ID within the set.
+bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const;
+// Delete CHAR_FRAGMENTs stored in properties of unichars array.
+void delete_pointers_in_unichars() {
+for (auto &unichar : unichars) {
+delete unichar.properties.fragment;
+unichar.properties.fragment = nullptr;
+}
+}
+// Clear the UNICHARSET (all the previous data is lost).
+void clear() {
+if (script_table != nullptr) {
+for (int i = 0; i < script_table_size_used; ++i) {
+delete[] script_table[i];
+}
+delete[] script_table;
+script_table = nullptr;
+script_table_size_used = 0;
+}
+script_table_size_reserved = 0;
+delete_pointers_in_unichars();
+unichars.clear();
+ids.clear();
+top_bottom_set_ = false;
+script_has_upper_lower_ = false;
+script_has_xheight_ = false;
+old_style_included_ = false;
+null_sid_ = 0;
+common_sid_ = 0;
+latin_sid_ = 0;
+cyrillic_sid_ = 0;
+greek_sid_ = 0;
+han_sid_ = 0;
+hiragana_sid_ = 0;
+katakana_sid_ = 0;
+thai_sid_ = 0;
+hangul_sid_ = 0;
+default_sid_ = 0;
+}
+// Return the size of the set (the number of different UNICHAR it holds).
+size_t size() const {
+return unichars.size();
+}
+// Opens the file indicated by filename and saves unicharset to that file.
+// Returns true if the operation is successful.
+bool save_to_file(const char *const filename) const {
+FILE *file = fopen(filename, "w+b");
+if (file == nullptr) {
+return false;
+}
+bool result = save_to_file(file);
+fclose(file);
+return result;
+}
+// Saves the content of the UNICHARSET to the given file.
+// Returns true if the operation is successful.
+bool save_to_file(FILE *file) const {
+std::string str;
+return save_to_string(str) &&
+tesseract::Serialize(file, &str[0], str.length());
+}
+bool save_to_file(tesseract::TFile *file) const {
+std::string str;
+return save_to_string(str) && file->Serialize(&str[0], str.length());
+}
+// Saves the content of the UNICHARSET to the given string.
+// Returns true if the operation is successful.
+bool save_to_string(std::string &str) const;
+// Opens the file indicated by filename and loads the UNICHARSET
+// from the given file. The previous data is lost.
+// Returns true if the operation is successful.
+bool load_from_file(const char *const filename, bool skip_fragments) {
+FILE *file = fopen(filename, "rb");
+if (file == nullptr) {
+return false;
+}
+bool result = load_from_file(file, skip_fragments);
+fclose(file);
+return result;
+}
+// returns true if the operation is successful.
+bool load_from_file(const char *const filename) {
+return load_from_file(filename, false);
+}
+// Loads the UNICHARSET from the given file. The previous data is lost.
+// Returns true if the operation is successful.
+bool load_from_file(FILE *file, bool skip_fragments);
+bool load_from_file(FILE *file) {
+return load_from_file(file, false);
+}
+bool load_from_file(tesseract::TFile *file, bool skip_fragments);
+// Sets up internal data after loading the file, based on the char
+// properties. Called from load_from_file, but also needs to be run
+// during set_unicharset_properties.
+void post_load_setup();
+// Returns true if right_to_left scripts are significant in the unicharset,
+// but without being so sensitive that "universal" unicharsets containing
+// characters from many scripts, like orientation and script detection,
+// look like they are right_to_left.
+bool major_right_to_left() const;
+// Set a whitelist and/or blacklist of characters to recognize.
+// An empty or nullptr whitelist enables everything (minus any blacklist).
+// An empty or nullptr blacklist disables nothing.
+// An empty or nullptr unblacklist has no effect.
+// The blacklist overrides the whitelist.
+// The unblacklist overrides the blacklist.
+// Each list is a string of utf8 character strings. Boundaries between
+// unicharset units are worked out automatically, and characters not in
+// the unicharset are silently ignored.
+void set_black_and_whitelist(const char *blacklist, const char *whitelist,
+const char *unblacklist);
+// Set the isalpha property of the given unichar to the given value.
+void set_isalpha(UNICHAR_ID unichar_id, bool value) {
+unichars[unichar_id].properties.isalpha = value;
+}
+// Set the islower property of the given unichar to the given value.
+void set_islower(UNICHAR_ID unichar_id, bool value) {
+unichars[unichar_id].properties.islower = value;
+}
+// Set the isupper property of the given unichar to the given value.
+void set_isupper(UNICHAR_ID unichar_id, bool value) {
+unichars[unichar_id].properties.isupper = value;
+}
+// Set the isdigit property of the given unichar to the given value.
+void set_isdigit(UNICHAR_ID unichar_id, bool value) {
+unichars[unichar_id].properties.isdigit = value;
+}
+// Set the ispunctuation property of the given unichar to the given value.
+void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
+unichars[unichar_id].properties.ispunctuation = value;
+}
+// Set the isngram property of the given unichar to the given value.
+void set_isngram(UNICHAR_ID unichar_id, bool value) {
+unichars[unichar_id].properties.isngram = value;
+}
+// Set the script name of the given unichar to the given value.
+// Value is copied and thus can be a temporary;
+void set_script(UNICHAR_ID unichar_id, const char *value) {
+unichars[unichar_id].properties.script_id = add_script(value);
+}
+// Set other_case unichar id in the properties for the given unichar id.
+void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
+unichars[unichar_id].properties.other_case = other_case;
+}
+// Set the direction property of the given unichar to the given value.
+void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
+unichars[unichar_id].properties.direction = value;
+}
+// Set mirror unichar id in the properties for the given unichar id.
+void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
+unichars[unichar_id].properties.mirror = mirror;
+}
+// Record normalized version of unichar with the given unichar_id.
+void set_normed(UNICHAR_ID unichar_id, const char *normed) {
+unichars[unichar_id].properties.normed = normed;
+unichars[unichar_id].properties.normed_ids.clear();
+}
+// Sets the normed_ids vector from the normed string. normed_ids is not
+// stored in the file, and needs to be set when the UNICHARSET is loaded.
+void set_normed_ids(UNICHAR_ID unichar_id);
+// Return the isalpha property of the given unichar.
+bool get_isalpha(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return false;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.isalpha;
+}
+// Return the islower property of the given unichar.
+bool get_islower(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return false;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.islower;
+}
+// Return the isupper property of the given unichar.
+bool get_isupper(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return false;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.isupper;
+}
+// Return the isdigit property of the given unichar.
+bool get_isdigit(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return false;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.isdigit;
+}
+// Return the ispunctuation property of the given unichar.
+bool get_ispunctuation(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return false;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.ispunctuation;
+}
+// Return the isngram property of the given unichar.
+bool get_isngram(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return false;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.isngram;
+}
+// Returns whether the unichar id represents a unicode value in the private
+// use area.
+bool get_isprivate(UNICHAR_ID unichar_id) const;
+// Returns true if the ids have useful min/max top/bottom values.
+bool top_bottom_useful() const {
+return top_bottom_set_;
+}
+// Sets all ranges to empty, so they can be expanded to set the values.
+void set_ranges_empty();
+// Sets all the properties for this unicharset given a src_unicharset with
+// everything set. The unicharsets don't have to be the same, and graphemes
+// are correctly accounted for.
+void SetPropertiesFromOther(const UNICHARSET &src) {
+PartialSetPropertiesFromOther(0, src);
+}
+// Sets properties from Other, starting only at the given index.
+void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src);
+// Expands the tops and bottoms and widths for this unicharset given a
+// src_unicharset with ranges in it. The unicharsets don't have to be the
+// same, and graphemes are correctly accounted for.
+void ExpandRangesFromOther(const UNICHARSET &src);
+// Makes this a copy of src. Clears this completely first, so the automattic
+// ids will not be present in this if not in src.
+void CopyFrom(const UNICHARSET &src);
+// For each id in src, if it does not occur in this, add it, as in
+// SetPropertiesFromOther, otherwise expand the ranges, as in
+// ExpandRangesFromOther.
+void AppendOtherUnicharset(const UNICHARSET &src);
+// Returns true if the acceptable ranges of the tops of the characters do
+// not overlap, making their x-height calculations distinct.
+bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
+// Returns the min and max bottom and top of the given unichar in
+// baseline-normalized coordinates, ie, where the baseline is
+// kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
+// (See normalis.h for the definitions).
+void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom,
+int *min_top, int *max_top) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+*min_bottom = *min_top = 0;
+*max_bottom = *max_top = 256; // kBlnCellHeight
+return;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+*min_bottom = unichars[unichar_id].properties.min_bottom;
+*max_bottom = unichars[unichar_id].properties.max_bottom;
+*min_top = unichars[unichar_id].properties.min_top;
+*max_top = unichars[unichar_id].properties.max_top;
+}
+void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom,
+int min_top, int max_top) {
+unichars[unichar_id].properties.min_bottom =
+ClipToRange<int>(min_bottom, 0, UINT8_MAX);
+unichars[unichar_id].properties.max_bottom =
+ClipToRange<int>(max_bottom, 0, UINT8_MAX);
+unichars[unichar_id].properties.min_top =
+ClipToRange<int>(min_top, 0, UINT8_MAX);
+unichars[unichar_id].properties.max_top =
+ClipToRange<int>(max_top, 0, UINT8_MAX);
+}
+// Returns the width stats (as mean, sd) of the given unichar relative to the
+// median advance of all characters in the character set.
+void get_width_stats(UNICHAR_ID unichar_id, float *width,
+float *width_sd) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+*width = 0.0f;
+*width_sd = 0.0f;
+return;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+*width = unichars[unichar_id].properties.width;
+*width_sd = unichars[unichar_id].properties.width_sd;
+}
+void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
+unichars[unichar_id].properties.width = width;
+unichars[unichar_id].properties.width_sd = width_sd;
+}
+// Returns the stats of the x-bearing (as mean, sd) of the given unichar
+// relative to the median advance of all characters in the character set.
+void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing,
+float *bearing_sd) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+*bearing = *bearing_sd = 0.0f;
+return;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+*bearing = unichars[unichar_id].properties.bearing;
+*bearing_sd = unichars[unichar_id].properties.bearing_sd;
+}
+void set_bearing_stats(UNICHAR_ID unichar_id, float bearing,
+float bearing_sd) {
+unichars[unichar_id].properties.bearing = bearing;
+unichars[unichar_id].properties.bearing_sd = bearing_sd;
+}
+// Returns the stats of the x-advance of the given unichar (as mean, sd)
+// relative to the median advance of all characters in the character set.
+void get_advance_stats(UNICHAR_ID unichar_id, float *advance,
+float *advance_sd) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+*advance = *advance_sd = 0;
+return;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+*advance = unichars[unichar_id].properties.advance;
+*advance_sd = unichars[unichar_id].properties.advance_sd;
+}
+void set_advance_stats(UNICHAR_ID unichar_id, float advance,
+float advance_sd) {
+unichars[unichar_id].properties.advance = advance;
+unichars[unichar_id].properties.advance_sd = advance_sd;
+}
+// Returns true if the font metrics properties are empty.
+bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
+return unichars[unichar_id].properties.AnyRangeEmpty();
+}
+// Returns true if the script of the given id is space delimited.
+// Returns false for Han and Thai scripts.
+bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return true;
+}
+int script_id = get_script(unichar_id);
+return script_id != han_sid_ && script_id != thai_sid_ &&
+script_id != hangul_sid_ && script_id != hiragana_sid_ &&
+script_id != katakana_sid_;
+}
+// Return the script name of the given unichar.
+// The returned pointer will always be the same for the same script, it's
+// managed by unicharset and thus MUST NOT be deleted
+int get_script(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return null_sid_;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.script_id;
+}
+// Return the character properties, eg. alpha/upper/lower/digit/punct,
+// as a bit field of unsigned int.
+unsigned int get_properties(UNICHAR_ID unichar_id) const;
+// Return the character property as a single char.  If a character has
+// multiple attributes, the main property is defined by the following order:
+//   upper_case : 'A'
+//   lower_case : 'a'
+//   alpha      : 'x'
+//   digit      : '0'
+//   punctuation: 'p'
+char get_chartype(UNICHAR_ID unichar_id) const;
+// Get other_case unichar id in the properties for the given unichar id.
+UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return INVALID_UNICHAR_ID;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.other_case;
+}
+// Returns the direction property of the given unichar.
+Direction get_direction(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return UNICHARSET::U_OTHER_NEUTRAL;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.direction;
+}
+// Get mirror unichar id in the properties for the given unichar id.
+UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return INVALID_UNICHAR_ID;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.mirror;
+}
+// Returns UNICHAR_ID of the corresponding lower-case unichar.
+UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return INVALID_UNICHAR_ID;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+if (unichars[unichar_id].properties.islower) {
+return unichar_id;
+}
+return unichars[unichar_id].properties.other_case;
+}
+// Returns UNICHAR_ID of the corresponding upper-case unichar.
+UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return INVALID_UNICHAR_ID;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+if (unichars[unichar_id].properties.isupper) {
+return unichar_id;
+}
+return unichars[unichar_id].properties.other_case;
+}
+// Returns true if this UNICHARSET has the special codes in
+// SpecialUnicharCodes available. If false then there are normal unichars
+// at these codes and they should not be used.
+bool has_special_codes() const {
+return get_fragment(UNICHAR_BROKEN) != nullptr &&
+strcmp(id_to_unichar(UNICHAR_BROKEN),
+kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
+}
+// Returns true if there are any repeated unicodes in the normalized
+// text of any unichar-id in the unicharset.
+bool AnyRepeatedUnicodes() const;
+// Return a pointer to the CHAR_FRAGMENT class if the given
+// unichar id represents a character fragment.
+const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
+if (INVALID_UNICHAR_ID == unichar_id) {
+return nullptr;
+}
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.fragment;
+}
+// Return the isalpha property of the given unichar representation.
+bool get_isalpha(const char *const unichar_repr) const {
+return get_isalpha(unichar_to_id(unichar_repr));
+}
+// Return the islower property of the given unichar representation.
+bool get_islower(const char *const unichar_repr) const {
+return get_islower(unichar_to_id(unichar_repr));
+}
+// Return the isupper property of the given unichar representation.
+bool get_isupper(const char *const unichar_repr) const {
+return get_isupper(unichar_to_id(unichar_repr));
+}
+// Return the isdigit property of the given unichar representation.
+bool get_isdigit(const char *const unichar_repr) const {
+return get_isdigit(unichar_to_id(unichar_repr));
+}
+// Return the ispunctuation property of the given unichar representation.
+bool get_ispunctuation(const char *const unichar_repr) const {
+return get_ispunctuation(unichar_to_id(unichar_repr));
+}
+// Return the character properties, eg. alpha/upper/lower/digit/punct,
+// of the given unichar representation
+unsigned int get_properties(const char *const unichar_repr) const {
+return get_properties(unichar_to_id(unichar_repr));
+}
+char get_chartype(const char *const unichar_repr) const {
+return get_chartype(unichar_to_id(unichar_repr));
+}
+// Return the script name of the given unichar representation.
+// The returned pointer will always be the same for the same script, it's
+// managed by unicharset and thus MUST NOT be deleted
+int get_script(const char *const unichar_repr) const {
+return get_script(unichar_to_id(unichar_repr));
+}
+// Return a pointer to the CHAR_FRAGMENT class struct if the given
+// unichar representation represents a character fragment.
+const CHAR_FRAGMENT *get_fragment(const char *const unichar_repr) const {
+if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
+!ids.contains(unichar_repr, false)) {
+return nullptr;
+}
+return get_fragment(unichar_to_id(unichar_repr));
+}
+// Return the isalpha property of the given unichar representation.
+// Only the first length characters from unichar_repr are used.
+bool get_isalpha(const char *const unichar_repr, int length) const {
+return get_isalpha(unichar_to_id(unichar_repr, length));
+}
+// Return the islower property of the given unichar representation.
+// Only the first length characters from unichar_repr are used.
+bool get_islower(const char *const unichar_repr, int length) const {
+return get_islower(unichar_to_id(unichar_repr, length));
+}
+// Return the isupper property of the given unichar representation.
+// Only the first length characters from unichar_repr are used.
+bool get_isupper(const char *const unichar_repr, int length) const {
+return get_isupper(unichar_to_id(unichar_repr, length));
+}
+// Return the isdigit property of the given unichar representation.
+// Only the first length characters from unichar_repr are used.
+bool get_isdigit(const char *const unichar_repr, int length) const {
+return get_isdigit(unichar_to_id(unichar_repr, length));
+}
+// Return the ispunctuation property of the given unichar representation.
+// Only the first length characters from unichar_repr are used.
+bool get_ispunctuation(const char *const unichar_repr, int length) const {
+return get_ispunctuation(unichar_to_id(unichar_repr, length));
+}
+// Returns normalized version of unichar with the given unichar_id.
+const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
+if (unichar_id == UNICHAR_SPACE) {
+return " ";
+}
+return unichars[unichar_id].properties.normed.c_str();
+}
+// Returns a vector of UNICHAR_IDs that represent the ids of the normalized
+// version of the given id. There may be more than one UNICHAR_ID in the
+// vector if unichar_id represents a ligature.
+const std::vector<UNICHAR_ID> &normed_ids(UNICHAR_ID unichar_id) const {
+return unichars[unichar_id].properties.normed_ids;
+}
+// Return the script name of the given unichar representation.
+// Only the first length characters from unichar_repr are used.
+// The returned pointer will always be the same for the same script, it's
+// managed by unicharset and thus MUST NOT be deleted
+int get_script(const char *const unichar_repr, int length) const {
+return get_script(unichar_to_id(unichar_repr, length));
+}
+// Return the (current) number of scripts in the script table
+int get_script_table_size() const {
+return script_table_size_used;
+}
+// Return the script string from its id
+const char *get_script_from_script_id(int id) const {
+if (id >= script_table_size_used || id < 0) {
+return null_script;
+}
+return script_table[id];
+}
+// Returns the id from the name of the script, or 0 if script is not found.
+// Note that this is an expensive operation since it involves iteratively
+// comparing strings in the script table.  To avoid dependency on STL, we
+// won't use a hash.  Instead, the calling function can use this to lookup
+// and save the ID for relevant scripts for fast comparisons later.
+int get_script_id_from_name(const char *script_name) const;
+// Return true if the given script is the null script
+bool is_null_script(const char *script) const {
+return script == null_script;
+}
+// Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
+// then the returned pointer will be the same.
+// The script parameter is copied and thus can be a temporary.
+int add_script(const char *script);
+// Return the enabled property of the given unichar.
+bool get_enabled(UNICHAR_ID unichar_id) const {
+ASSERT_HOST(contains_unichar_id(unichar_id));
+return unichars[unichar_id].properties.enabled;
+}
+int null_sid() const {
+return null_sid_;
+}
+int common_sid() const {
+return common_sid_;
+}
+int latin_sid() const {
+return latin_sid_;
+}
+int cyrillic_sid() const {
+return cyrillic_sid_;
+}
+int greek_sid() const {
+return greek_sid_;
+}
+int han_sid() const {
+return han_sid_;
+}
+int hiragana_sid() const {
+return hiragana_sid_;
+}
+int katakana_sid() const {
+return katakana_sid_;
+}
+int thai_sid() const {
+return thai_sid_;
+}
+int hangul_sid() const {
+return hangul_sid_;
+}
+int default_sid() const {
+return default_sid_;
+}
+// Returns true if the unicharset has the concept of upper/lower case.
+bool script_has_upper_lower() const {
+return script_has_upper_lower_;
+}
+// Returns true if the unicharset has the concept of x-height.
+// script_has_xheight can be true even if script_has_upper_lower is not,
+// when the script has a sufficiently predominant top line with ascenders,
+// such as Devanagari and Thai.
+bool script_has_xheight() const {
+return script_has_xheight_;
+}
+private:
+struct TESS_API UNICHAR_PROPERTIES {
+UNICHAR_PROPERTIES();
+// Initializes all properties to sensible default values.
+void Init();
+// Sets all ranges wide open. Initialization default in case there are
+// no useful values available.
+void SetRangesOpen();
+// Sets all ranges to empty. Used before expanding with font-based data.
+void SetRangesEmpty();
+// Returns true if any of the top/bottom/width/bearing/advance ranges/stats
+// is empty.
+bool AnyRangeEmpty() const;
+// Expands the ranges with the ranges from the src properties.
+void ExpandRangesFrom(const UNICHAR_PROPERTIES &src);
+// Copies the properties from src into this.
+void CopyFrom(const UNICHAR_PROPERTIES &src);
+bool isalpha;
+bool islower;
+bool isupper;
+bool isdigit;
+bool ispunctuation;
+bool isngram;
+bool enabled;
+// Possible limits of the top and bottom of the bounding box in
+// baseline-normalized coordinates, ie, where the baseline is
+// kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
+// (See normalis.h for the definitions).
+uint8_t min_bottom;
+uint8_t max_bottom;
+uint8_t min_top;
+uint8_t max_top;
+// Statistics of the widths of bounding box, relative to the median advance.
+float width;
+float width_sd;
+// Stats of the x-bearing and advance, also relative to the median advance.
+float bearing;
+float bearing_sd;
+float advance;
+float advance_sd;
+int script_id;
+UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
+Direction direction;   // direction of this unichar
+// Mirror property is useful for reverse DAWG lookup for words in
+// right-to-left languages (e.g. "(word)" would be in
+// '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
+// However, what we want in our DAWG is
+// '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
+// '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
+UNICHAR_ID mirror;
+// A string of unichar_ids that represent the corresponding normed string.
+// For awkward characters like em-dash, this gives hyphen.
+// For ligatures, this gives the string of normal unichars.
+std::vector<UNICHAR_ID> normed_ids;
+std::string normed; // normalized version of this unichar
+// Contains meta information about the fragment if a unichar represents
+// a fragment of a character, otherwise should be set to nullptr.
+// It is assumed that character fragments are added to the unicharset
+// after the corresponding 'base' characters.
+CHAR_FRAGMENT *fragment;
+};
+struct UNICHAR_SLOT {
+char representation[UNICHAR_LEN + 1];
+UNICHAR_PROPERTIES properties;
+};
+// Internal recursive version of encode_string above.
+// str is the start of the whole string.
+// str_index is the current position in str.
+// str_length is the length of str.
+// encoding is a working encoding of str.
+// lengths is a working set of lengths of each element of encoding.
+// best_total_length is the longest length of str that has been successfully
+// encoded so far.
+// On return:
+// best_encoding contains the encoding that used the longest part of str.
+// best_lengths (may be null) contains the lengths of best_encoding.
+void encode_string(const char *str, int str_index, int str_length,
+std::vector<UNICHAR_ID> *encoding,
+std::vector<char> *lengths, unsigned *best_total_length,
+std::vector<UNICHAR_ID> *best_encoding,
+std::vector<char> *best_lengths) const;
+// Gets the properties for a grapheme string, combining properties for
+// multiple characters in a meaningful way where possible.
+// Returns false if no valid match was found in the unicharset.
+// NOTE that script_id, mirror, and other_case refer to this unicharset on
+// return and will need redirecting if the target unicharset is different.
+bool GetStrProperties(const char *utf8_str, UNICHAR_PROPERTIES *props) const;
+// Load ourselves from a "file" where our only interface to the file is
+// an implementation of fgets().  This is the parsing primitive accessed by
+// the public routines load_from_file().
+bool load_via_fgets(const std::function<char *(char *, int)> &fgets_cb,
+bool skip_fragments);
+// List of mappings to make when ingesting strings from the outside.
+// The substitutions clean up text that should exists for rendering of
+// synthetic data, but not in the recognition set.
+static const char *kCleanupMaps[][2];
+static const char *null_script;
+std::vector<UNICHAR_SLOT> unichars;
+UNICHARMAP ids;
+char **script_table;
+int script_table_size_used;
+int script_table_size_reserved;
+// True if the unichars have their tops/bottoms set.
+bool top_bottom_set_;
+// True if the unicharset has significant upper/lower case chars.
+bool script_has_upper_lower_;
+// True if the unicharset has a significant mean-line with significant
+// ascenders above that.
+bool script_has_xheight_;
+// True if the set contains chars that would be changed by the cleanup.
+bool old_style_included_;
+// A few convenient script name-to-id mapping without using hash.
+// These are initialized when unicharset file is loaded.  Anything
+// missing from this list can be looked up using get_script_id_from_name.
+int null_sid_;
+int common_sid_;
+int latin_sid_;
+int cyrillic_sid_;
+int greek_sid_;
+int han_sid_;
+int hiragana_sid_;
+int katakana_sid_;
+int thai_sid_;
+int hangul_sid_;
+// The most frequently occurring script in the charset.
+int default_sid_;
+};
+} // namespace tesseract
+#endif // TESSERACT_CCUTIL_UNICHARSET_H_

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/ccutil/unicharset.h @ 2:b50eed0cc0ef upstream