Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/thirdparty/tesseract/src/ccutil/unicharset.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccutil/unicharset.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,1186 @@
+///////////////////////////////////////////////////////////////////////
+// File:        unicharset.cpp
+// Description: Unicode character/ligature set class.
+// Author:      Thomas Kielbus
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "unicharset.h"
+
+#include "params.h"
+
+#include <tesseract/unichar.h>
+#include "serialis.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <iomanip> // for std::setw
+#include <locale>  // for std::locale::classic
+#include <sstream> // for std::istringstream, std::ostringstream
+
+namespace tesseract {
+
+// Special character used in representing character fragments.
+static const char kSeparator = '|';
+// Special character used in representing 'natural' character fragments.
+static const char kNaturalFlag = 'n';
+
+static const int ISALPHA_MASK = 0x1; // set by get_properties() when get_isalpha()
+static const int ISLOWER_MASK = 0x2; // set by get_properties() when get_islower()
+static const int ISUPPER_MASK = 0x4; // set by get_properties() when get_isupper()
+static const int ISDIGIT_MASK = 0x8; // set by get_properties() when get_isdigit()
+static const int ISPUNCTUATION_MASK = 0x10; // set when get_ispunctuation()
+
+// Y coordinate threshold for determining cap-height vs x-height.
+// TODO(rays) Bring the global definition down to the ccutil library level,
+// so this constant is relative to some other constants.
+static const int kMeanlineThreshold = 220;
+// Let C be the number of alpha chars for which all tops exceed
+// kMeanlineThreshold, and X the number of alpha chars for which all
+// tops are below kMeanlineThreshold, then if X > C *
+// kMinXHeightFraction and C > X * kMinCapHeightFraction or more than
+// half the alpha characters have upper or lower case, then the
+// unicharset "has x-height".
+const double kMinXHeightFraction = 0.25;
+const double kMinCapHeightFraction = 0.05;
+
+// static: {expanded text, private-use encoding} pairs, ended by {nullptr, nullptr}.
+const char *UNICHARSET::kCustomLigatures[][2] = {
+    {"ct", "\uE003"}, // c + t -> U+E003
+    {"ſh", "\uE006"}, // long-s + h -> U+E006
+    {"ſi", "\uE007"}, // long-s + i -> U+E007
+    {"ſl", "\uE008"}, // long-s + l -> U+E008
+    {"ſſ", "\uE009"}, // long-s + long-s -> U+E009
+    {nullptr, nullptr}};
+
+// List of mappings to make when ingesting strings from the outside.
+// The substitutions clean up text that should exist for rendering of
+// synthetic data, but not in the recognition set.
+const char *UNICHARSET::kCleanupMaps[][2] = {
+    {"\u0640", ""},   // TATWEEL is deleted.
+    {"\ufb01", "fi"}, // fi ligature->fi pair.
+    {"\ufb02", "fl"}, // fl ligature->fl pair.
+    {nullptr, nullptr}};
+
+// List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
+const char *UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
+    " ", "Joined", "|Broken|0|1"};
+
+const char *UNICHARSET::null_script = "NULL"; // script name given to each newly inserted unichar
+
+UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
+  Init(); // a new object starts with the defaults assigned by Init()
+}
+
+// Resets every property to its default value: all boolean flags off, ranges
+// wide open, ids zeroed, empty normed string, left-to-right direction and
+// no fragment.
+void UNICHARSET::UNICHAR_PROPERTIES::Init() {
+  // Classification flags.
+  isalpha = islower = isupper = isdigit = false;
+  ispunctuation = isngram = enabled = false;
+  // Top/bottom ranges and width/bearing/advance stats.
+  SetRangesOpen();
+  // References to other entries of the unicharset.
+  script_id = 0;
+  other_case = 0;
+  mirror = 0;
+  // Normalized text, direction and fragment information.
+  normed.clear();
+  direction = UNICHARSET::U_LEFT_TO_RIGHT;
+  fragment = nullptr;
+}
+
+// Opens the top and bottom ranges to the full [0, UINT8_MAX] span and zeroes
+// the width, bearing and advance statistics. This is the default used when
+// no useful values are available.
+void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
+  min_bottom = min_top = 0;
+  max_bottom = max_top = UINT8_MAX;
+  width = width_sd = 0.0f;
+  bearing = bearing_sd = 0.0f;
+  advance = advance_sd = 0.0f;
+}
+
+// Inverts the top and bottom ranges (min = UINT8_MAX, max = 0) so that they
+// are empty, and zeroes the width, bearing and advance statistics. Used
+// before expanding the ranges with font-based data.
+void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
+  min_bottom = min_top = UINT8_MAX;
+  max_bottom = max_top = 0;
+  width = width_sd = 0.0f;
+  bearing = bearing_sd = 0.0f;
+  advance = advance_sd = 0.0f;
+}
+
+// Returns true if the width or the advance is still exactly 0.
+// NOTE: the top/bottom ranges and the bearing are not examined here.
+bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
+  return width == 0.0f || advance == 0.0f;
+}
+
+// Widens the top/bottom ranges of this object so that they include those of
+// src. For each of width, bearing and advance, the (value, sd) pair of src
+// replaces the one stored here whenever src has the larger sd.
+void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
+    const UNICHAR_PROPERTIES &src) {
+  // Feed both ends of each src range into the corresponding range here.
+  UpdateRange(src.min_top, &min_top, &max_top);
+  UpdateRange(src.max_top, &min_top, &max_top);
+  UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
+  UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
+  // Decide per statistic whether src carries the wider spread.
+  const bool take_width = src.width_sd > width_sd;
+  const bool take_bearing = src.bearing_sd > bearing_sd;
+  const bool take_advance = src.advance_sd > advance_sd;
+  if (take_width) {
+    width_sd = src.width_sd;
+    width = src.width;
+  }
+  if (take_bearing) {
+    bearing_sd = src.bearing_sd;
+    bearing = src.bearing;
+  }
+  if (take_advance) {
+    advance_sd = src.advance_sd;
+    advance = src.advance;
+  }
+}
+
+// Makes this object equal to src in every member except the fragment
+// pointer, which keeps the value it had before the call.
+void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES &src) {
+  CHAR_FRAGMENT *const own_fragment = fragment;
+  // Default assignment copies everything, fragment pointer included...
+  *this = src;
+  // ...so put our own fragment pointer back afterwards.
+  fragment = own_fragment;
+}
+
+// Builds a unicharset holding only the entries of kSpecialUnicharCodes,
+// inserted in table order; the UNICHAR_JOINED entry is flagged as an ngram.
+UNICHARSET::UNICHARSET()
+    : ids(), script_table(nullptr), script_table_size_used(0) {
+  clear();
+  int code = 0;
+  while (code < SPECIAL_UNICHAR_CODES_COUNT) {
+    unichar_insert(kSpecialUnicharCodes[code]);
+    if (code == UNICHAR_JOINED) {
+      set_isngram(code, true);
+    }
+    ++code;
+  }
+}
+
+UNICHARSET::~UNICHARSET() {
+  clear(); // all cleanup is delegated to clear()
+}
+
+// Looks up the id of a NUL-terminated unichar string. The string is passed
+// through CleanupString first unless old-style (unclean) unichars have been
+// included. Returns INVALID_UNICHAR_ID if the string is not in the set.
+UNICHAR_ID
+UNICHARSET::unichar_to_id(const char *const unichar_repr) const {
+  const std::string key = old_style_included_ ? std::string(unichar_repr)
+                                              : CleanupString(unichar_repr);
+  if (!ids.contains(key.data(), key.size())) {
+    return INVALID_UNICHAR_ID;
+  }
+  return ids.unichar_to_id(key.data(), key.size());
+}
+
+// Looks up the id of the first length bytes of unichar_repr; length must be
+// in (0, UNICHAR_LEN]. The bytes are passed through CleanupString first
+// unless old-style (unclean) unichars have been included.
+// Returns INVALID_UNICHAR_ID if the string is not in the set.
+UNICHAR_ID UNICHARSET::unichar_to_id(const char *const unichar_repr,
+                                     int length) const {
+  assert(length > 0 && length <= UNICHAR_LEN);
+  const std::string key = old_style_included_
+                              ? std::string(unichar_repr, length)
+                              : CleanupString(unichar_repr, length);
+  if (!ids.contains(key.data(), key.size())) {
+    return INVALID_UNICHAR_ID;
+  }
+  return ids.unichar_to_id(key.data(), key.size());
+}
+
+// Returns the byte length of the first UNICHAR_ID in the encoding of str, or
+// 0 if the beginning of the string cannot be encoded.
+// WARNING: the whole string is encoded to obtain the answer, so prefer
+// encode_string over calling step repeatedly.
+int UNICHARSET::step(const char *str) const {
+  std::vector<UNICHAR_ID> found_ids;
+  std::vector<char> byte_lengths;
+  encode_string(str, true, &found_ids, &byte_lengths, nullptr);
+  const bool usable =
+      !found_ids.empty() && found_ids[0] != INVALID_UNICHAR_ID;
+  return usable ? byte_lengths[0] : 0;
+}
+
+// Reports whether the UTF-8 string str can be encoded completely with this
+// UNICHARSET. first_bad_position is forwarded to encode_string as its
+// encoded_length argument, so when it is not nullptr it receives the byte
+// offset at which encoding stopped.
+bool UNICHARSET::encodable_string(const char *str,
+                                  unsigned *first_bad_position) const {
+  std::vector<UNICHAR_ID> discarded_ids;
+  const bool encodable =
+      encode_string(str, true, &discarded_ids, nullptr, first_bad_position);
+  return encodable;
+}
+
+// Encodes the given UTF-8 string with this UNICHARSET.
+// Returns true if the whole string was encoded. On a failure, returns false
+// and either stops at that point (give_up_on_failure) or records an
+// INVALID_UNICHAR_ID for the bad character and encodes the rest of the string.
+// If lengths is not nullptr, it receives the byte length of each encoded
+// UNICHAR_ID; if encoded_length is not nullptr, the byte offset reached.
+// WARNING: Caller must guarantee that str has already been cleaned of codes
+// that do not belong in the unicharset, or encoding may fail.
+// Use CleanupString to perform the cleaning.
+bool UNICHARSET::encode_string(const char *str, bool give_up_on_failure,
+                               std::vector<UNICHAR_ID> *encoding,
+                               std::vector<char> *lengths,
+                               unsigned *encoded_length) const {
+  std::vector<UNICHAR_ID> working_encoding; // scratch ids for the recursive search
+  std::vector<char> working_lengths; // scratch byte lengths, parallel to working_encoding
+  std::vector<char> best_lengths; // byte lengths matching *encoding
+  encoding->clear(); // Just in case str is empty.
+  auto str_length = strlen(str);
+  unsigned str_pos = 0; // bytes of str covered by *encoding so far
+  bool perfect = true; // becomes false at the first non-match
+  while (str_pos < str_length) {
+    encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
+                  &str_pos, encoding, &best_lengths); // moves str_pos to the end of the best match
+    if (str_pos < str_length) {
+      // This is a non-match. Skip one utf-8 character.
+      perfect = false;
+      if (give_up_on_failure) {
+        break;
+      }
+      int step = UNICHAR::utf8_step(str + str_pos);
+      if (step == 0) { // no usable step reported: skip a single byte
+        step = 1;
+      }
+      encoding->push_back(INVALID_UNICHAR_ID);
+      best_lengths.push_back(step);
+      str_pos += step;
+      working_encoding = *encoding; // resync the scratch buffers before retrying
+      working_lengths = best_lengths;
+    }
+  }
+  if (lengths != nullptr) {
+    *lengths = std::move(best_lengths);
+  }
+  if (encoded_length != nullptr) {
+    *encoded_length = str_pos;
+  }
+  return perfect;
+}
+
+// Returns the stored UTF-8 representation of id, or INVALID_UNICHAR when id
+// is INVALID_UNICHAR_ID. Any other id must be smaller than size().
+const char *UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
+  if (id != INVALID_UNICHAR_ID) {
+    ASSERT_HOST(static_cast<unsigned>(id) < this->size());
+    return unichars[id].representation;
+  }
+  return INVALID_UNICHAR;
+}
+
+// Like id_to_unichar, except that a private-use encoding listed in
+// kCustomLigatures is returned as the expanded text of that table entry.
+const char *UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {
+  if (id == INVALID_UNICHAR_ID) {
+    return INVALID_UNICHAR;
+  }
+  ASSERT_HOST(static_cast<unsigned>(id) < this->size());
+  if (!get_isprivate(id)) {
+    // Not a private encoding: the stored representation is the answer.
+    return unichars[id].representation;
+  }
+  // Private encoding: search the ligature table for its expansion.
+  const char *stored = id_to_unichar(id);
+  for (int row = 0; kCustomLigatures[row][0] != nullptr; ++row) {
+    if (strcmp(stored, kCustomLigatures[row][1]) == 0) {
+      return kCustomLigatures[row][0];
+    }
+  }
+  // No table entry matched, fall back to the stored representation.
+  return unichars[id].representation;
+}
+
+// Returns str followed by " [", the hex value of each of its unicodes (each
+// followed by a space) and a closing "]".
+// A byte for which UNICHAR::utf8_step reports no valid step, or a lead byte
+// whose sequence is cut short by the end of the string, is shown as the hex
+// value of that single raw byte.
+std::string UNICHARSET::debug_utf8_str(const char *str) {
+  std::string result = str;
+  result += " [";
+  int step = 1;
+  // Chop into unicodes and code each as hex.
+  for (int i = 0; str[i] != '\0'; i += step) {
+    char hex[sizeof(int) * 2 + 1];
+    step = UNICHAR::utf8_step(str + i);
+    // Never let a multi-byte step jump over the terminating NUL: a sequence
+    // truncated by the end of the string is handled like an illegal byte.
+    for (int k = 1; k < step; ++k) {
+      if (str[i + k] == '\0') {
+        step = 0;
+        break;
+      }
+    }
+    if (step == 0) {
+      step = 1;
+      // Convert via unsigned char so that bytes >= 0x80 print as two hex
+      // digits rather than a sign-extended value where char is signed.
+      snprintf(hex, sizeof(hex), "%x",
+               static_cast<unsigned>(static_cast<unsigned char>(str[i])));
+    } else {
+      UNICHAR ch(str + i, step);
+      snprintf(hex, sizeof(hex), "%x", ch.first_uni());
+    }
+    result += hex;
+    result += " ";
+  }
+  result += "]";
+  return result;
+}
+
+// Builds a debug description of the unichar with the given id:
+//  - INVALID_UNICHAR_ID gives the text of id_to_unichar(id);
+//  - a fragment gives the fragment's own to_string();
+//  - anything else gives debug_utf8_str of its text followed by one letter
+//    per property: a (lower alpha), A (upper alpha) or x (other alpha),
+//    then 0 if a digit, then p if punctuation.
+std::string UNICHARSET::debug_str(UNICHAR_ID id) const {
+  if (id == INVALID_UNICHAR_ID) {
+    return std::string(id_to_unichar(id));
+  }
+  if (const CHAR_FRAGMENT *frag = this->get_fragment(id)) {
+    return frag->to_string();
+  }
+  std::string text = debug_utf8_str(id_to_unichar(id));
+  if (get_isalpha(id)) {
+    // Case marker: lower wins over upper, x means alpha with neither case.
+    text += get_islower(id) ? 'a' : (get_isupper(id) ? 'A' : 'x');
+  }
+  if (get_isdigit(id)) {
+    text += '0';
+  }
+  if (get_ispunctuation(id)) {
+    text += 'p';
+  }
+  return text;
+}
+
+// Rebuilds normed_ids for unichar_id from its normed string. normed_ids is
+// not stored in the file, so it has to be recomputed after loading.
+// A UNICHAR_SPACE entry whose text starts with ' ' maps to UNICHAR_SPACE;
+// if the normed string cannot be fully encoded the id maps to itself.
+void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
+  auto &props = unichars[unichar_id].properties;
+  props.normed_ids.clear();
+  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
+    props.normed_ids.push_back(UNICHAR_SPACE);
+    return;
+  }
+  const bool encoded = encode_string(props.normed.c_str(), true,
+                                     &props.normed_ids, nullptr, nullptr);
+  if (!encoded) {
+    // Drop any partial result and fall back to the id itself.
+    props.normed_ids.clear();
+    props.normed_ids.push_back(unichar_id);
+  }
+}
+
+// Tells whether the first unicode of the given unichar lies in the private
+// use area U+E000..U+F8FF. That range is used only internally, to represent
+// uncommon ligatures (eg. 'ct') that have no regular unicode value.
+bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
+  UNICHAR repr(id_to_unichar(unichar_id), -1);
+  const int first_code = repr.first_uni();
+  if (first_code < 0xE000) {
+    return false;
+  }
+  return first_code <= 0xF8FF;
+}
+
+// Calls SetRangesEmpty on the properties of every unichar, so that the
+// ranges can afterwards be expanded to their real values.
+void UNICHARSET::set_ranges_empty() {
+  for (unsigned id = 0; id < unichars.size(); ++id) {
+    unichars[id].properties.SetRangesEmpty();
+  }
+}
+
+// For every unichar from start_index onwards, fetches the properties of its
+// text from src and stores them here. The two unicharsets need not be the
+// same: script_id, other_case and mirror are translated into ids of this
+// unicharset, and other_case/mirror fall back to the unichar itself when
+// their text is not present here. Unichars that src cannot describe are
+// left untouched.
+void UNICHARSET::PartialSetPropertiesFromOther(int start_index,
+                                               const UNICHARSET &src) {
+  for (unsigned ch = start_index; ch < unichars.size(); ++ch) {
+    UNICHAR_PROPERTIES properties;
+    if (!src.GetStrProperties(id_to_unichar(ch), &properties)) {
+      continue;
+    }
+    const UNICHAR_ID self = static_cast<UNICHAR_ID>(ch);
+    // Script: register the script name of src in this unicharset.
+    properties.script_id =
+        add_script(src.get_script_from_script_id(properties.script_id));
+    // Other case: translate by text, or point at the unichar itself.
+    const char *other_case = src.id_to_unichar(properties.other_case);
+    properties.other_case =
+        contains_unichar(other_case) ? unichar_to_id(other_case) : self;
+    // Mirror: same translation rule as for other case.
+    const char *mirror_str = src.id_to_unichar(properties.mirror);
+    properties.mirror =
+        contains_unichar(mirror_str) ? unichar_to_id(mirror_str) : self;
+    unichars[ch].properties.CopyFrom(properties);
+    set_normed_ids(ch);
+  }
+}
+
+// Expands the ranges stored for every unichar of this unicharset with the
+// ranges that src holds for the same text. The two unicharsets need not be
+// the same; unichars that src cannot describe are left untouched.
+void UNICHARSET::ExpandRangesFromOther(const UNICHARSET &src) {
+  for (unsigned ch = 0; ch < unichars.size(); ++ch) {
+    UNICHAR_PROPERTIES from_src;
+    if (!src.GetStrProperties(id_to_unichar(ch), &from_src)) {
+      continue;
+    }
+    // Only the ranges are taken over, nothing else.
+    unichars[ch].properties.ExpandRangesFrom(from_src);
+  }
+}
+
+// Turns this into a copy of src. This is cleared completely first, so the
+// automatic ids are present afterwards only if src has them. The order of
+// the set is NOT changed.
+void UNICHARSET::CopyFrom(const UNICHARSET &src) {
+  clear();
+  for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
+    unichar_insert_backwards_compatible(src.id_to_unichar(ch));
+    unichars[ch].properties.ExpandRangesFrom(src.unichars[ch].properties);
+  }
+  // Fill in the remaining properties, mirror and other_case included,
+  // without reordering the unicharset.
+  PartialSetPropertiesFromOther(0, src);
+}
+
+// Merges src into this. A unichar of src that is already present here only
+// has its ranges expanded, as in ExpandRangesFromOther; one that is missing
+// is inserted with empty ranges and then receives its properties from src,
+// as in SetPropertiesFromOther.
+void UNICHARSET::AppendOtherUnicharset(const UNICHARSET &src) {
+  const int first_new = unichars.size();
+  for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
+    const char *utf8 = src.id_to_unichar(ch);
+    if (!contains_unichar(utf8)) {
+      // Unknown here: the new entry gets the next free index.
+      const int new_id = unichars.size();
+      unichar_insert_backwards_compatible(utf8);
+      unichars[new_id].properties.SetRangesEmpty();
+      continue;
+    }
+    // Known here: just expand the current ranges.
+    const int known_id = unichar_to_id(utf8);
+    unichars[known_id].properties.ExpandRangesFrom(src.unichars[ch].properties);
+  }
+  // Set properties, mirror and other_case included, for the appended
+  // entries only and without reordering the unicharset.
+  PartialSetPropertiesFromOther(first_new, src);
+}
+
+// Returns true if the [min_top, max_top] ranges of the two unichars do not
+// overlap, which makes their x-height calculations distinct.
+bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const {
+  const auto &first = unichars[id1].properties;
+  const auto &second = unichars[id2].properties;
+  const int lowest_max_top = std::min(first.max_top, second.max_top);
+  const int highest_min_top = std::max(first.min_top, second.min_top);
+  return lowest_max_top - highest_min_top <= 0;
+}
+
+// Internal recursive version of encode_string above.
+// Seeks to encode the given string as a sequence of UNICHAR_IDs such that
+// each UNICHAR_ID uses the least possible part of the utf8 str.
+// It does this by depth-first tail recursion on increasing length matches
+// to the UNICHARSET, saving the first encountered result that encodes the
+// maximum total length of str. It stops on a failure to encode to make
+// the overall process of encoding a partially failed string more efficient.
+// See unicharset.h for definition of the args.
+void UNICHARSET::encode_string(const char *str, int str_index, int str_length,
+                               std::vector<UNICHAR_ID> *encoding,
+                               std::vector<char> *lengths,
+                               unsigned *best_total_length,
+                               std::vector<UNICHAR_ID> *best_encoding,
+                               std::vector<char> *best_lengths) const {
+  if (str_index > static_cast<int>(*best_total_length)) { // covers more bytes than any result so far
+    // This is the best result so far.
+    *best_total_length = str_index;
+    *best_encoding = *encoding;
+    if (best_lengths != nullptr) {
+      *best_lengths = *lengths;
+    }
+  }
+  if (str_index == str_length) { // the whole string has been consumed
+    return;
+  }
+  int encoding_index = encoding->size(); // size to truncate back to when a candidate fails
+  // Find the length of the first matching unicharset member.
+  int length = ids.minmatch(str + str_index);
+  if (length == 0 || str_index + length > str_length) { // no match, or it would run past the end
+    return;
+  }
+  do {
+    if (ids.contains(str + str_index, length)) {
+      // Successful encoding so far.
+      UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
+      encoding->push_back(id);
+      lengths->push_back(length);
+      encode_string(str, str_index + length, str_length, encoding, lengths,
+                    best_total_length, best_encoding, best_lengths);
+      if (static_cast<int>(*best_total_length) == str_length) {
+        return; // Tail recursion success!
+      }
+      // Failed with that length, truncate back and try again.
+      encoding->resize(encoding_index);
+      lengths->resize(encoding_index);
+    }
+    int step = UNICHAR::utf8_step(str + str_index + length); // size of the next character to add
+    if (step == 0) { // no usable step reported: extend by a single byte
+      step = 1;
+    }
+    length += step;
+  } while (length <= UNICHAR_LEN && str_index + length <= str_length);
+}
+
+// Computes the properties of a grapheme string by combining the properties
+// of the unichars it encodes to:
+//  - boolean flags are OR-ed together;
+//  - top/bottom ranges are merged;
+//  - advances add up, the bearing is the smallest offset bearing seen, and
+//    for more than one unichar the width is estimated as advance - bearing;
+//  - script_id, other_case, mirror and direction come from the first unichar;
+//  - the normed string is the concatenation of the individual normed strings.
+// Returns false if the string cannot be encoded or encodes to nothing.
+// NOTE that script_id, mirror, and other_case refer to this unicharset on
+// return and will need translation if the target unicharset is different.
+bool UNICHARSET::GetStrProperties(const char *utf8_str,
+                                  UNICHAR_PROPERTIES *props) const {
+  props->Init();
+  props->SetRangesEmpty();
+  std::vector<UNICHAR_ID> encoding;
+  if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr)) {
+    return false; // Some part was invalid.
+  }
+  const unsigned num_parts = encoding.size();
+  for (unsigned part = 0; part < num_parts; ++part) {
+    const bool is_first = (part == 0);
+    const UNICHAR_PROPERTIES &piece = unichars[encoding[part]].properties;
+    // A flag is set as soon as any piece has it.
+    props->isalpha = props->isalpha || piece.isalpha;
+    props->islower = props->islower || piece.islower;
+    props->isupper = props->isupper || piece.isupper;
+    props->isdigit = props->isdigit || piece.isdigit;
+    props->ispunctuation = props->ispunctuation || piece.ispunctuation;
+    props->isngram = props->isngram || piece.isngram;
+    props->enabled = props->enabled || piece.enabled;
+    // Merge the vertical ranges.
+    UpdateRange(piece.min_bottom, &props->min_bottom, &props->max_bottom);
+    UpdateRange(piece.max_bottom, &props->min_bottom, &props->max_bottom);
+    UpdateRange(piece.min_top, &props->min_top, &props->max_top);
+    UpdateRange(piece.max_top, &props->min_top, &props->max_top);
+    // The bearing of this piece is offset by the advance accumulated so far.
+    const float offset_bearing = props->advance + piece.bearing;
+    if (is_first || offset_bearing < props->bearing) {
+      props->bearing = offset_bearing;
+      props->bearing_sd = props->advance_sd + piece.bearing_sd;
+    }
+    props->advance += piece.advance;
+    props->advance_sd += piece.advance_sd;
+    // For a single piece the stored width is used as it is.
+    props->width = piece.width;
+    props->width_sd = piece.width_sd;
+    // The first piece supplies script id, other_case, mirror and direction.
+    // All of them except direction need translation by the caller.
+    if (is_first) {
+      props->script_id = piece.script_id;
+      props->other_case = piece.other_case;
+      props->mirror = piece.mirror;
+      props->direction = piece.direction;
+    }
+    // Normed text of the compound is the concatenation of the pieces.
+    props->normed += piece.normed;
+  }
+  if (num_parts > 1) {
+    // Estimate the total widths from the advance - bearing.
+    props->width = props->advance - props->bearing;
+    props->width_sd = props->advance_sd + props->bearing_sd;
+  }
+  return num_parts > 0;
+}
+
+// TODO(rays) clean-up the order of functions to match unicharset.h.
+
+// Packs the alpha/lower/upper/digit/punctuation flags of id into one bit
+// mask built from the IS*_MASK constants.
+unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
+  unsigned int bits = 0;
+  bits |= this->get_isalpha(id) ? ISALPHA_MASK : 0;
+  bits |= this->get_islower(id) ? ISLOWER_MASK : 0;
+  bits |= this->get_isupper(id) ? ISUPPER_MASK : 0;
+  bits |= this->get_isdigit(id) ? ISDIGIT_MASK : 0;
+  bits |= this->get_ispunctuation(id) ? ISPUNCTUATION_MASK : 0;
+  return bits;
+}
+
+// Returns a one-letter class for id, testing in this order: 'A' for upper
+// case, 'a' for lower case, 'x' for other alpha, '0' for a digit, 'p' for
+// punctuation, and 0 when none of these applies.
+char UNICHARSET::get_chartype(UNICHAR_ID id) const {
+  return this->get_isupper(id)         ? 'A'
+         : this->get_islower(id)       ? 'a'
+         : this->get_isalpha(id)       ? 'x'
+         : this->get_isdigit(id)       ? '0'
+         : this->get_ispunctuation(id) ? 'p'
+                                       : '\0';
+}
+
+void UNICHARSET::unichar_insert(const char *const unichar_repr,
+                                OldUncleanUnichars old_style) { // appends unichar_repr as a new entry if it is not known yet
+  if (old_style == OldUncleanUnichars::kTrue) {
+    old_style_included_ = true; // from here on the flag disables CleanupString below
+  }
+  std::string cleaned =
+      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
+  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) { // empty or already present: nothing to do
+    const char *str = cleaned.c_str();
+    std::vector<int> encoding;
+    if (!old_style_included_ &&
+        encode_string(str, true, &encoding, nullptr, nullptr)) {
+      return; // already encodable from existing entries, so no new entry is made
+    }
+    unichars.emplace_back(); // the new entry is appended before the length check below
+    auto &u = unichars.back();
+    int index = 0;
+    do {
+      if (index >= UNICHAR_LEN) {
+        fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
+                unichar_repr);
+        return; // NOTE(review): the entry appended above stays in unichars but is never added to ids
+      }
+      u.representation[index++] = *str++;
+    } while (*str != '\0');
+    u.representation[index] = '\0';
+    this->set_script(unichars.size() - 1, null_script);
+    // If the given unichar_repr represents a fragmented character, set
+    // fragment property to a pointer to CHAR_FRAGMENT class instance with
+    // information parsed from the unichar representation. Use the script
+    // of the base unichar for the fragmented character if possible.
+    CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(u.representation);
+    u.properties.fragment = frag;
+    if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
+      u.properties.script_id = this->get_script(frag->get_unichar());
+    }
+    u.properties.enabled = true;
+    ids.insert(u.representation, unichars.size() - 1); // register the text -> id mapping last
+  }
+}
+
+// Tells whether the NUL-terminated unichar string is in the set. The string
+// is passed through CleanupString first unless old-style (unclean) unichars
+// have been included.
+bool UNICHARSET::contains_unichar(const char *const unichar_repr) const {
+  const std::string key = old_style_included_ ? std::string(unichar_repr)
+                                              : CleanupString(unichar_repr);
+  return ids.contains(key.data(), key.size());
+}
+
+// Tells whether the first length bytes of unichar_repr are in the set; a
+// length of 0 is never contained. The bytes are passed through CleanupString
+// first unless old-style (unclean) unichars have been included.
+bool UNICHARSET::contains_unichar(const char *const unichar_repr,
+                                  int length) const {
+  if (length == 0) {
+    return false;
+  }
+  const std::string key = old_style_included_
+                              ? std::string(unichar_repr, length)
+                              : CleanupString(unichar_repr, length);
+  return ids.contains(key.data(), key.size());
+}
+
+// Tells whether the text stored for unichar_id is byte-for-byte equal to
+// unichar_repr.
+bool UNICHARSET::eq(UNICHAR_ID unichar_id,
+                    const char *const unichar_repr) const {
+  const char *stored = this->id_to_unichar(unichar_id);
+  return strcmp(stored, unichar_repr) == 0;
+}
+
+// Serializes the unicharset into str, replacing its previous contents.
+// The first line holds the number of unichars; one line per unichar follows:
+//  - the entry whose text is " " is written as
+//      NULL <properties> <script> <other_case>
+//  - every other entry is written as
+//      <text> <properties> <min_bottom>,<max_bottom>,<min_top>,<max_top>,
+//      <width>,<width_sd>,<bearing>,<bearing_sd>,<advance>,<advance_sd>
+//      <script> <other_case> <direction> <mirror> <normed>\t# <debug string>
+// <properties> is the bit mask from get_properties() and is always written
+// in hexadecimal, which is the radix load_via_fgets reads it with.
+// Always returns true.
+bool UNICHARSET::save_to_string(std::string &str) const {
+  const int kFileBufSize = 1024;
+  char buffer[kFileBufSize + 1];
+  snprintf(buffer, kFileBufSize, "%zu\n", this->size());
+  str = buffer;
+  for (unsigned id = 0; id < this->size(); ++id) {
+    int min_bottom, max_bottom, min_top, max_top;
+    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
+    float width, width_sd;
+    get_width_stats(id, &width, &width_sd);
+    float bearing, bearing_sd;
+    get_bearing_stats(id, &bearing, &bearing_sd);
+    float advance, advance_sd;
+    get_advance_stats(id, &advance, &advance_sd);
+    unsigned int properties = this->get_properties(id);
+    if (strcmp(this->id_to_unichar(id), " ") == 0) {
+      snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
+               this->get_script_from_script_id(this->get_script(id)),
+               this->get_other_case(id));
+      str += buffer;
+    } else {
+      // The classic locale keeps the number formatting independent of the
+      // global locale.
+      std::ostringstream stream;
+      stream.imbue(std::locale::classic());
+      // The properties mask must be written in hex, like the %x used for
+      // the " " entry above: in decimal a mask such as 0x10 would come out
+      // as "16" and be read back as 0x16. Switch back to decimal right
+      // after it so that all the other numbers are unaffected.
+      stream << this->id_to_unichar(id) << ' ' << std::hex << properties
+             << std::dec << ' ' << min_bottom << ',' << max_bottom << ','
+             << min_top << ',' << max_top << ',' << width << ',' << width_sd
+             << ',' << bearing << ',' << bearing_sd << ',' << advance << ','
+             << advance_sd << ' '
+             << this->get_script_from_script_id(this->get_script(id)) << ' '
+             << this->get_other_case(id) << ' ' << this->get_direction(id)
+             << ' ' << this->get_mirror(id) << ' '
+             << this->get_normed_unichar(id) << "\t# "
+             << this->debug_str(id).c_str() << '\n';
+      str += stream.str().c_str();
+    }
+  }
+  return true;
+}
+
+// Thin wrapper that gives a C FILE stream an fgets(dst, size) member, so
+// that it can be used as a line-reading callback. The wrapper only stores
+// the pointer; it never closes the stream.
+class LocalFilePointer {
+  // Stream that the lines are read from.
+  FILE *fp_;
+
+public:
+  LocalFilePointer(FILE *stream) : fp_(stream) {}
+
+  // Forwards to ::fgets on the wrapped stream and returns its result.
+  char *fgets(char *dst, int size) {
+    char *const line = ::fgets(dst, size, fp_);
+    return line;
+  }
+};
+
+// Loads the unicharset from a C FILE stream by handing load_via_fgets a
+// callback that reads from file. Returns what load_via_fgets returns.
+bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
+  LocalFilePointer reader(file);
+  const std::function<char *(char *, int)> read_line =
+      [&reader](char *dst, int size) { return reader.fgets(dst, size); };
+  return load_via_fgets(read_line, skip_fragments);
+}
+
+// Loads the unicharset from a TFile by handing load_via_fgets a callback
+// that calls file->FGets. Returns what load_via_fgets returns.
+bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
+  const std::function<char *(char *, int)> read_line =
+      [file](char *dst, int size) { return file->FGets(dst, size); };
+  return load_via_fgets(read_line, skip_fragments);
+}
+
+bool UNICHARSET::load_via_fgets(
+    const std::function<char *(char *, int)> &fgets_cb, bool skip_fragments) {
+  int unicharset_size;
+  char buffer[256];
+
+  this->clear();
+  if (fgets_cb(buffer, sizeof(buffer)) == nullptr ||
+      sscanf(buffer, "%d", &unicharset_size) != 1) {
+    return false;
+  }
+  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
+    char unichar[256];
+    unsigned int properties;
+    char script[64];
+
+    strncpy(script, null_script, sizeof(script) - 1);
+    int min_bottom = 0;
+    int max_bottom = UINT8_MAX;
+    int min_top = 0;
+    int max_top = UINT8_MAX;
+    float width = 0.0f;
+    float width_sd = 0.0f;
+    float bearing = 0.0f;
+    float bearing_sd = 0.0f;
+    float advance = 0.0f;
+    float advance_sd = 0.0f;
+    // TODO(eger): check that this default it ok
+    // after enabling BiDi iterator for Arabic.
+    int direction = UNICHARSET::U_LEFT_TO_RIGHT;
+    UNICHAR_ID other_case = unicharset_size;
+    UNICHAR_ID mirror = unicharset_size;
+    if (fgets_cb(buffer, sizeof(buffer)) == nullptr) {
+      return false;
+    }
+    char normed[64];
+    normed[0] = '\0';
+    std::istringstream stream(buffer);
+    stream.imbue(std::locale::classic());
+    // 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标  # 标 [6807 ]x
+    // stream.flags(std::ios::hex);
+    stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;
+    // stream.flags(std::ios::dec);
+    if (stream.fail()) {
+      fprintf(stderr, "%s:%d failed\n", __FILE__, __LINE__);
+      return false;
+    }
+    auto position = stream.tellg();
+    stream.seekg(position);
+    char c1, c2, c3, c4, c5, c6, c7, c8, c9;
+    stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
+        max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >>
+        bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >>
+        script >> other_case >> direction >> mirror >> std::setw(63) >> normed;
+    if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
+        c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
+      stream.clear();
+      stream.seekg(position);
+      stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
+          max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >>
+          bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >>
+          script >> other_case >> direction >> mirror;
+      if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
+          c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
+        stream.clear();
+        stream.seekg(position);
+        stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
+            max_top >> std::setw(63) >> script >> other_case >> direction >>
+            mirror;
+        if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
+          stream.clear();
+          stream.seekg(position);
+          stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
+              max_top >> std::setw(63) >> script >> other_case;
+          if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
+            stream.clear();
+            stream.seekg(position);
+            stream >> std::setw(63) >> script >> other_case;
+            if (stream.fail()) {
+              stream.clear();
+              stream.seekg(position);
+              stream >> std::setw(63) >> script;
+            }
+          }
+        }
+      }
+    }
+
+    // Skip fragments if needed.
+    CHAR_FRAGMENT *frag = nullptr;
+    if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
+      int num_pieces = frag->get_total();
+      delete frag;
+      // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
+      if (num_pieces > 1) {
+        continue;
+      }
+    }
+    // Insert unichar into unicharset and set its properties.
+    if (strcmp(unichar, "NULL") == 0) {
+      this->unichar_insert(" ");
+    } else {
+      this->unichar_insert_backwards_compatible(unichar);
+    }
+
+    this->set_isalpha(id, properties & ISALPHA_MASK);
+    this->set_islower(id, properties & ISLOWER_MASK);
+    this->set_isupper(id, properties & ISUPPER_MASK);
+    this->set_isdigit(id, properties & ISDIGIT_MASK);
+    this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
+    this->set_isngram(id, false);
+    this->set_script(id, script);
+    this->unichars[id].properties.enabled = true;
+    this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
+    this->set_width_stats(id, width, width_sd);
+    this->set_bearing_stats(id, bearing, bearing_sd);
+    this->set_advance_stats(id, advance, advance_sd);
+    this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
+    this->set_other_case(id, (other_case < unicharset_size) ? other_case : id);
+    this->set_mirror(id, (mirror < unicharset_size) ? mirror : id);
+    this->set_normed(id, normed[0] != '\0' ? normed : unichar);
+  }
+  post_load_setup();
+  return true;
+}
+
+// Sets up internal data after loading the file, based on the char
+// properties. Called from load_from_file, but also needs to be run
+// during set_unicharset_properties.
+//
+// Recomputes top_bottom_set_, script_has_upper_lower_ and
+// script_has_xheight_, refreshes the normed ids of every unichar, looks up
+// the ids of the well-known scripts and picks default_sid_.
+void UNICHARSET::post_load_setup() {
+  // Number of alpha chars with the case property minus those without,
+  // in order to determine that half the alpha chars have case.
+  int net_case_alphas = 0;
+  int x_height_alphas = 0;
+  int cap_height_alphas = 0;
+  top_bottom_set_ = false;
+  for (unsigned id = 0; id < unichars.size(); ++id) {
+    int min_bottom = 0;
+    int max_bottom = UINT8_MAX;
+    int min_top = 0;
+    int max_top = UINT8_MAX;
+    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
+    // A single unichar with a non-zero min_top marks the top/bottom data
+    // as present for the whole set.
+    if (min_top > 0) {
+      top_bottom_set_ = true;
+    }
+    if (get_isalpha(id)) {
+      if (get_islower(id) || get_isupper(id)) {
+        ++net_case_alphas;
+      } else {
+        --net_case_alphas;
+      }
+      // Classify the alpha by whether its whole top range lies below or
+      // above the mean-line threshold; straddling chars count for neither.
+      if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) {
+        ++x_height_alphas;
+      } else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) {
+        ++cap_height_alphas;
+      }
+    }
+    set_normed_ids(id);
+  }
+
+  script_has_upper_lower_ = net_case_alphas > 0;
+  script_has_xheight_ =
+      script_has_upper_lower_ ||
+      (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
+       cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
+
+  null_sid_ = get_script_id_from_name(null_script);
+  ASSERT_HOST(null_sid_ == 0);
+  common_sid_ = get_script_id_from_name("Common");
+  latin_sid_ = get_script_id_from_name("Latin");
+  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
+  greek_sid_ = get_script_id_from_name("Greek");
+  han_sid_ = get_script_id_from_name("Han");
+  hiragana_sid_ = get_script_id_from_name("Hiragana");
+  katakana_sid_ = get_script_id_from_name("Katakana");
+  thai_sid_ = get_script_id_from_name("Thai");
+  hangul_sid_ = get_script_id_from_name("Hangul");
+
+  // Compute default script. Use the highest-counting alpha script, that is
+  // not the common script, as that still contains some "alphas".
+  // The per-script tally lives in a vector so it is released automatically
+  // on every exit path.
+  std::vector<int> script_counts(static_cast<size_t>(script_table_size_used),
+                                 0);
+  for (unsigned id = 0; id < unichars.size(); ++id) {
+    if (get_isalpha(id)) {
+      ++script_counts[get_script(id)];
+    }
+  }
+  default_sid_ = 0;
+  for (int s = 1; s < script_table_size_used; ++s) {
+    if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) {
+      default_sid_ = s;
+    }
+  }
+}
+
+// Decides whether the unicharset is predominantly right-to-left.
+// Every unichar whose direction is U_LEFT_TO_RIGHT votes for LTR; every
+// unichar whose direction is U_RIGHT_TO_LEFT, U_RIGHT_TO_LEFT_ARABIC or
+// U_ARABIC_NUMBER votes for RTL; all other directions abstain. Requiring a
+// strict RTL majority keeps "universal" unicharsets with characters from
+// many scripts (e.g. orientation and script detection) from looking
+// right-to-left.
+bool UNICHARSET::major_right_to_left() const {
+  int ltr_votes = 0;
+  int rtl_votes = 0;
+  for (unsigned uid = 0; uid < unichars.size(); ++uid) {
+    switch (get_direction(uid)) {
+      case UNICHARSET::U_LEFT_TO_RIGHT:
+        ++ltr_votes;
+        break;
+      case UNICHARSET::U_RIGHT_TO_LEFT:
+      case UNICHARSET::U_RIGHT_TO_LEFT_ARABIC:
+      case UNICHARSET::U_ARABIC_NUMBER:
+        ++rtl_votes;
+        break;
+      default:
+        // Neutral or other directions do not influence the result.
+        break;
+    }
+  }
+  return rtl_votes > ltr_votes;
+}
+
+// Applies a whitelist, blacklist and unblacklist of characters to recognize.
+// - A null or empty whitelist enables every unichar; otherwise only the
+//   unichars it encodes to start out enabled.
+// - A null or empty blacklist disables nothing; otherwise the unichars it
+//   encodes to are disabled.
+// - A null or empty unblacklist has no effect; otherwise the unichars it
+//   encodes to are re-enabled last, overriding the blacklist.
+void UNICHARSET::set_black_and_whitelist(const char *blacklist,
+                                         const char *whitelist,
+                                         const char *unblacklist) {
+  // Encodes `chars` and stores `enabled` on every valid id produced.
+  // Null or empty input is ignored.
+  auto mark_listed = [this](const char *chars, bool enabled) {
+    if (chars == nullptr || chars[0] == '\0') {
+      return;
+    }
+    std::vector<UNICHAR_ID> ids;
+    encode_string(chars, false, &ids, nullptr, nullptr);
+    for (UNICHAR_ID uid : ids) {
+      if (uid != INVALID_UNICHAR_ID) {
+        unichars[uid].properties.enabled = enabled;
+      }
+    }
+  };
+
+  // Start from the default: everything on unless a whitelist restricts it.
+  const bool enable_all = whitelist == nullptr || whitelist[0] == '\0';
+  for (auto &entry : unichars) {
+    entry.properties.enabled = enable_all;
+  }
+
+  // Order matters: whitelist first, then blacklist, then unblacklist.
+  mark_listed(whitelist, true);
+  mark_listed(blacklist, false);
+  mark_listed(unblacklist, true);
+}
+
+// Reports whether the normalized text of any unichar-id contains the same
+// unicode twice in a row. When the unicharset has special codes, the first
+// SPECIAL_UNICHAR_CODES_COUNT ids are not examined.
+bool UNICHARSET::AnyRepeatedUnicodes() const {
+  const int first_id =
+      has_special_codes() ? static_cast<int>(SPECIAL_UNICHAR_CODES_COUNT) : 0;
+  for (unsigned id = first_id; id < unichars.size(); ++id) {
+    // Decode the normalized form and look for two equal neighbours.
+    const std::vector<char32> codepoints =
+        UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
+    if (std::adjacent_find(codepoints.begin(), codepoints.end()) !=
+        codepoints.end()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns the id (index in script_table) of the given script name, first
+// appending a copy of the name to the table when it is not already present.
+// The table is created with room for 8 entries and doubles in capacity each
+// time it fills up.
+int UNICHARSET::add_script(const char *script) {
+  // Reuse the id of a script that is already registered.
+  for (int known = 0; known < script_table_size_used; ++known) {
+    if (strcmp(script, script_table[known]) == 0) {
+      return known;
+    }
+  }
+
+  if (script_table_size_reserved == 0) {
+    // Very first script: allocate the initial table.
+    script_table_size_reserved = 8;
+    script_table = new char *[script_table_size_reserved];
+  } else if (script_table_size_used >= script_table_size_reserved) {
+    // Table is full: move the stored name pointers into one twice as large.
+    assert(script_table_size_used == script_table_size_reserved);
+    script_table_size_reserved *= 2;
+    char **grown_table = new char *[script_table_size_reserved];
+    std::copy(script_table, script_table + script_table_size_used,
+              grown_table);
+    delete[] script_table;
+    script_table = grown_table;
+  }
+
+  // Store a private copy of the name in the next free slot.
+  const int new_id = script_table_size_used;
+  const size_t name_bytes = strlen(script) + 1; // includes the terminator
+  script_table[new_id] = new char[name_bytes];
+  memcpy(script_table[new_id], script, name_bytes);
+  ++script_table_size_used;
+  return new_id;
+}
+
+// Builds the string representation of a fragment from its unichar, its
+// position and the total number of pieces.
+// A fragment with total == 1 is represented by the plain unichar. Otherwise
+// the result is: separator, unichar, separator, pos, then the natural flag
+// (if `natural`) or another separator, then total.
+std::string CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
+                                     bool natural) {
+  if (total == 1) {
+    return std::string(unichar);
+  }
+  // Format the numeric tail first, then assemble the pieces.
+  const char total_marker = natural ? kNaturalFlag : kSeparator;
+  char tail[kMaxLen];
+  snprintf(tail, kMaxLen, "%c%d%c%d", kSeparator, pos, total_marker, total);
+
+  std::string text(1, kSeparator);
+  text.append(unichar);
+  text.append(tail);
+  return text;
+}
+
+// Parses a fragment representation: separator, unichar, separator, pos,
+// then a separator or the natural flag, then total.
+// Returns a newly allocated CHAR_FRAGMENT that the caller must delete, or
+// nullptr if the string cannot be parsed as a fragment.
+CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
+  const char *ptr = string;
+  int len = strlen(string);
+  if (len < kMinLen || *ptr != kSeparator) {
+    return nullptr; // this string cannot represent a fragment
+  }
+  ptr++; // move to the next character
+  // Measure the unichar: everything up to the next separator.
+  int step = 0;
+  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
+    const int char_step = UNICHAR::utf8_step(ptr + step);
+    if (char_step <= 0) {
+      // NOTE(review): utf8_step is expected to report 0 for a byte that
+      // cannot start a UTF-8 character -- confirm against its declaration.
+      // Without this check `step` would stop advancing and this loop would
+      // never terminate on such input.
+      return nullptr;
+    }
+    step += char_step;
+  }
+  if (step == 0 || step > UNICHAR_LEN) {
+    return nullptr; // no character for unichar or the character is too long
+  }
+  char unichar[UNICHAR_LEN + 1];
+  strncpy(unichar, ptr, step);
+  unichar[step] = '\0'; // null terminate unichar
+  ptr += step;          // move to the next fragment separator
+  int pos = 0;
+  int total = 0;
+  bool natural = false;
+  char *end_ptr = nullptr;
+  // Two numeric fields follow: pos (i == 0) and total (i == 1).
+  for (int i = 0; i < 2; i++) {
+    if (ptr > string + len || *ptr != kSeparator) {
+      // Only the delimiter in front of total may be the natural flag.
+      if (i == 1 && *ptr == kNaturalFlag) {
+        natural = true;
+      } else {
+        return nullptr; // Failed to parse fragment representation.
+      }
+    }
+    ptr++; // move to the next character
+    const int value = static_cast<int>(strtol(ptr, &end_ptr, 10));
+    if (i == 0) {
+      pos = value;
+    } else {
+      total = value;
+    }
+    ptr = end_ptr;
+  }
+  if (ptr != string + len) {
+    return nullptr; // malformed fragment representation
+  }
+  auto *fragment = new CHAR_FRAGMENT();
+  fragment->set_all(unichar, pos, total, natural);
+  return fragment;
+}
+
+// Returns the index of the first script_table entry equal to script_name,
+// or 0 when no entry matches.
+int UNICHARSET::get_script_id_from_name(const char *script_name) const {
+  const auto table_begin = script_table;
+  const auto table_end = table_begin + script_table_size_used;
+  const auto hit =
+      std::find_if(table_begin, table_end, [script_name](const char *entry) {
+        return strcmp(script_name, entry) == 0;
+      });
+  if (hit == table_end) {
+    return 0; // 0 is always the null_script
+  }
+  return static_cast<int>(hit - table_begin);
+}
+
+// Removes/replaces content that belongs in rendered text, but not in the
+// unicharset.
+// Processes at most `length` bytes of utf8_str, stopping early at a NUL
+// byte. Wherever a key from kCleanupMaps occurs, the mapped replacement is
+// emitted in its place; every other byte is copied unchanged.
+// Returns the cleaned-up string.
+/* static */
+std::string UNICHARSET::CleanupString(const char *utf8_str, size_t length) {
+  std::string result;
+  result.reserve(length);
+  // Bytes of the input still allowed to be consumed. Tracking the exact
+  // number consumed (rather than one per iteration) keeps multi-byte keys
+  // from letting the scan run past `length`.
+  size_t remaining = length;
+  while (remaining > 0 && *utf8_str != '\0') {
+    const char *replacement = nullptr;
+    size_t consumed = 0;
+    for (int key_index = 0; kCleanupMaps[key_index][0] != nullptr;
+         ++key_index) {
+      const char *key = kCleanupMaps[key_index][0];
+      // Compare the key against the input, never looking beyond the bytes
+      // the caller allowed.
+      size_t match = 0;
+      while (key[match] != '\0' && match < remaining &&
+             key[match] == utf8_str[match]) {
+        ++match;
+      }
+      // An empty key is ignored so that every iteration consumes input.
+      if (match > 0 && key[match] == '\0') {
+        replacement = kCleanupMaps[key_index][1];
+        consumed = match;
+        break;
+      }
+    }
+    if (replacement == nullptr) {
+      // No key starts here: keep the byte as it is.
+      result.push_back(*utf8_str);
+      consumed = 1;
+    } else {
+      result.append(replacement);
+    }
+    utf8_str += consumed;
+    remaining -= consumed;
+  }
+  return result;
+}
+
+} // namespace tesseract
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children