diff mupdf-source/thirdparty/tesseract/src/ccutil/unicharcompress.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: the expanded directory no longer includes a version number.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccutil/unicharcompress.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,450 @@
+///////////////////////////////////////////////////////////////////////
+// File:        unicharcompress.cpp
+// Description: Unicode re-encoding using a sequence of smaller numbers in
+//              place of a single large code for CJK, similarly for Indic,
+//              and dissection of ligatures for other scripts.
+// Author:      Ray Smith
+//
+// (C) Copyright 2015, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "unicharcompress.h"
+#include <algorithm>
+#include <memory>
+#include "tprintf.h"
+
+namespace tesseract {
+
+// String used to represent the null_id in direct_set.
+static const char *kNullChar = "<nul>";
+// Radix to make unique values from the stored radical codes.
+// Used by RadicalPreHash as the base of a positional (polynomial) encoding.
+const int kRadicalRadix = 29;
+
+// "Hash" function for const std::vector<int>: folds the elements into a
+// single value using base-kRadicalRadix positional encoding (note: despite
+// the older description, this is NOT a plain sum — each element is scaled
+// by a power of the radix).
+// Build a unique number for each code sequence that we can use as the index in
+// a hash map of ints instead of trying to hash the vectors.
+static int RadicalPreHash(const std::vector<int> &rs) {
+  size_t result = 0;
+  for (int radical : rs) {
+    result *= kRadicalRadix;
+    result += radical;
+  }
+  // NOTE(review): size_t is implicitly narrowed to int here, so long
+  // sequences can wrap; presumably acceptable because the value is only a
+  // pre-hash key where collisions degrade uniqueness, not correctness —
+  // confirm against callers.
+  return result;
+}
+
+// A hash map to convert unicodes to radical encoding.
+// Key: unicode value (as parsed from the radical-stroke table);
+// value: owned list of radical codes for that character.
+using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;
+// A hash map to count occurrences of each radical encoding.
+// Key: RadicalPreHash of a radical sequence; value: occurrence count.
+using RSCounts = std::unordered_map<int, int>;
+
+// Decodes a single line of the radical-stroke table into *radical_map.
+// Expected format: "<unicode> <radical> [<radical>...]" with decimal fields
+// separated by spaces. Empty lines and lines starting with '#' are treated
+// as comments and accepted. Returns false on a malformed line (fewer than
+// two fields, or any field that is not fully numeric).
+static bool DecodeRadicalLine(std::string &radical_data_line, RSMap *radical_map) {
+  if (radical_data_line.empty() || (radical_data_line)[0] == '#') {
+    return true;
+  }
+  std::vector<std::string> entries = split(radical_data_line, ' ');
+  if (entries.size() < 2) {
+    return false;
+  }
+  char *end = nullptr;
+  int unicode = strtol(&entries[0][0], &end, 10);
+  // strtol must consume the whole field, otherwise the line is malformed.
+  if (*end != '\0') {
+    return false;
+  }
+  std::unique_ptr<std::vector<int>> radicals(new std::vector<int>);
+  for (size_t i = 1; i < entries.size(); ++i) {
+    int radical = strtol(&entries[i][0], &end, 10);
+    if (*end != '\0') {
+      return false;
+    }
+    radicals->push_back(radical);
+  }
+  // A later duplicate unicode entry replaces the earlier one.
+  (*radical_map)[unicode] = std::move(radicals);
+  return true;
+}
+
+// Helper function builds the RSMap from the radical-stroke file, which has
+// already been read into a string. Returns false on error.
+// The radical_stroke_table is non-const because it gets split and the caller
+// is unlikely to want to use it again.
+static bool DecodeRadicalTable(std::string &radical_data, RSMap *radical_map) {
+  std::vector<std::string> lines = split(radical_data, '\n');
+  for (unsigned i = 0; i < lines.size(); ++i) {
+    if (!DecodeRadicalLine(lines[i], radical_map)) {
+      // Note: the reported line number is 0-based.
+      tprintf("Invalid format in radical table at line %d: %s\n", i, lines[i].c_str());
+      return false;
+    }
+  }
+  return true;
+}
+
+// Constructs an empty compressor with no codes assigned.
+UnicharCompress::UnicharCompress() : code_range_(0) {}
+// Copy constructor delegates to operator= so the derived decoding tables
+// are rebuilt from the copied encoder rather than shallow-copied.
+UnicharCompress::UnicharCompress(const UnicharCompress &src) {
+  *this = src;
+}
+UnicharCompress::~UnicharCompress() {
+  Cleanup();
+}
+// Assignment copies only encoder_ and code_range_; Cleanup first frees the
+// old heap-allocated code lists, then SetupDecoder regenerates decoder_,
+// is_valid_start_, next_codes_ and final_codes_ from the new encoder_.
+UnicharCompress &UnicharCompress::operator=(const UnicharCompress &src) {
+  Cleanup();
+  encoder_ = src.encoder_;
+  code_range_ = src.code_range_;
+  SetupDecoder();
+  return *this;
+}
+
+// Computes the encoding for the given unicharset. It is a requirement that
+// the file training/langdata/radical-stroke.txt have been read into the
+// input string radical_stroke_table.
+// radical_stroke_table may be nullptr, in which case no Han radical
+// decomposition is available and Han characters fall through to the direct
+// unicode-sequence encoding below.
+// Returns false if the encoding cannot be constructed.
+bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,
+                                      std::string *radical_stroke_table) {
+  RSMap radical_map;
+  if (radical_stroke_table != nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map)) {
+    return false;
+  }
+  encoder_.clear();
+  UNICHARSET direct_set;
+  // To avoid unused codes, clear the special codes from the direct_set.
+  direct_set.clear();
+  // Always keep space as 0;
+  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
+  // Null char is next if we have one.
+  if (null_id >= 0) {
+    direct_set.unichar_insert(kNullChar);
+  }
+  RSCounts radical_counts;
+  // In the initial map, codes [0, unicharset.size()) are
+  // reserved for non-han/hangul sequences of 1 or more unicodes.
+  int hangul_offset = unicharset.size();
+  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
+  const int kTotalJamos = kLCount + kVCount + kTCount;
+  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
+  // to measure the number of radicals and strokes, initially we use the same
+  // code range for all 3 Han code positions, and fix them after.
+  int han_offset = hangul_offset + kTotalJamos;
+  for (unsigned u = 0; u <= unicharset.size(); ++u) {
+    // We special-case allow null_id to be equal to unicharset.size() in case
+    // there is no space in unicharset for it.
+    if (u == unicharset.size() && static_cast<int>(u) != null_id) {
+      break; // Finished
+    }
+    RecodedCharID code;
+    // Convert to unicodes.
+    std::vector<char32> unicodes;
+    std::string cleaned;
+    if (u < unicharset.size()) {
+      cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
+    }
+    if (u < unicharset.size() && (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
+      // Check single unicodes for Hangul/Han and encode if so.
+      int unicode = unicodes[0];
+      int leading, vowel, trailing;
+      auto it = radical_map.find(unicode);
+      if (it != radical_map.end()) {
+        // This is Han. Use the radical codes directly.
+        int num_radicals = it->second->size();
+        for (int c = 0; c < num_radicals; ++c) {
+          code.Set(c, han_offset + (*it->second)[c]);
+        }
+        // Characters sharing the same radical sequence are disambiguated by
+        // appending an extra code derived from how many have been seen.
+        int pre_hash = RadicalPreHash(*it->second);
+        int num_samples = radical_counts[pre_hash]++;
+        if (num_samples > 0) {
+          code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
+        }
+      } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
+        // This is Hangul. Since we know the exact size of each part at compile
+        // time, it gets the bottom set of codes.
+        code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
+                  trailing + kLCount + kVCount + hangul_offset);
+      }
+    }
+    // If the code is still empty, it wasn't Han or Hangul.
+    if (code.empty()) {
+      // Special cases.
+      if (u == UNICHAR_SPACE) {
+        code.Set(0, 0); // Space.
+      } else if (static_cast<int>(u) == null_id ||
+                 (unicharset.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT)) {
+        code.Set(0, direct_set.unichar_to_id(kNullChar));
+      } else {
+        // Add the direct_set unichar-ids of the unicodes in sequence to the
+        // code.
+        for (int uni : unicodes) {
+          int position = code.length();
+          if (position >= RecodedCharID::kMaxCodeLen) {
+            tprintf("Unichar %d=%s is too long to encode!!\n", u, unicharset.id_to_unichar(u));
+            return false;
+          }
+          UNICHAR unichar(uni);
+          char *utf8 = unichar.utf8_str();
+          if (!direct_set.contains_unichar(utf8)) {
+            direct_set.unichar_insert(utf8);
+          }
+          code.Set(position, direct_set.unichar_to_id(utf8));
+          delete[] utf8;
+          if (direct_set.size() > unicharset.size() + !unicharset.has_special_codes()) {
+            // Code space got bigger!
+            tprintf("Code space expanded from original unicharset!!\n");
+            return false;
+          }
+        }
+      }
+    }
+    encoder_.push_back(code);
+  }
+  // Now renumber Han to make all codes unique. We already added han_offset to
+  // all Han. Now separate out the radical, stroke, and count codes for Han.
+  // For each code position i, find the maximum value used (relative to
+  // han_offset) and shift later positions past it, so each position ends up
+  // occupying its own disjoint value range.
+  int code_offset = 0;
+  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
+    int max_offset = 0;
+    for (unsigned u = 0; u < unicharset.size(); ++u) {
+      RecodedCharID *code = &encoder_[u];
+      if (code->length() <= i) {
+        continue;
+      }
+      max_offset = std::max(max_offset, (*code)(i)-han_offset);
+      code->Set(i, (*code)(i) + code_offset);
+    }
+    // No code has an element at this position or beyond: done.
+    if (max_offset == 0) {
+      break;
+    }
+    code_offset += max_offset + 1;
+  }
+  // Compact the code space and rebuild the decoding tables.
+  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
+  SetupDecoder();
+  return true;
+}
+
+// Sets up an encoder that doesn't change the unichars at all, so it just
+// passes them through unchanged: each unichar id u maps to the length-1
+// code (u).
+void UnicharCompress::SetupPassThrough(const UNICHARSET &unicharset) {
+  std::vector<RecodedCharID> codes;
+  for (unsigned u = 0; u < unicharset.size(); ++u) {
+    RecodedCharID code;
+    code.Set(0, u);
+    codes.push_back(code);
+  }
+  // NOTE(review): when the unicharset has no special codes, one extra code
+  // (== unicharset.size()) is appended — presumably to serve as the null
+  // label; confirm against callers.
+  if (!unicharset.has_special_codes()) {
+    RecodedCharID code;
+    code.Set(0, unicharset.size());
+    codes.push_back(code);
+  }
+  SetupDirect(codes);
+}
+
+// Sets up an encoder directly using the given encoding vector, which maps
+// unichar_ids to the given codes. Recomputes code_range_ and rebuilds all
+// decoding tables from the new encoder.
+void UnicharCompress::SetupDirect(const std::vector<RecodedCharID> &codes) {
+  encoder_ = codes;
+  ComputeCodeRange();
+  SetupDecoder();
+}
+
+// Renumbers codes to eliminate unused values.
+// encoded_null, if >= 0, is the code value currently assigned to the null
+// char; it is relocated to the very end of the compacted code space.
+void UnicharCompress::DefragmentCodeValues(int encoded_null) {
+  // There may not be any Hangul, but even if there is, it is possible that not
+  // all codes are used. Likewise with the Han encoding, it is possible that not
+  // all numbers of strokes are used.
+  ComputeCodeRange();
+  // offsets serves two phases: first as a used-flag (1 = some code uses this
+  // value), then overwritten with the (non-positive) shift to apply to each
+  // used value. Unused values keep 0 and are never applied.
+  std::vector<int> offsets(code_range_);
+  // Find which codes are used
+  for (auto &code : encoder_) {
+    for (int i = 0; i < code.length(); ++i) {
+      offsets[code(i)] = 1;
+    }
+  }
+  // Compute offsets based on code use.
+  int offset = 0;
+  for (unsigned i = 0; i < offsets.size(); ++i) {
+    // If not used, decrement everything above here.
+    // We are moving encoded_null to the end, so it is not "used".
+    if (offsets[i] == 0 || i == static_cast<unsigned>(encoded_null)) {
+      --offset;
+    } else {
+      offsets[i] = offset;
+    }
+  }
+  if (encoded_null >= 0) {
+    // The encoded_null is moving to the end, for the benefit of TensorFlow,
+    // which is offsets.size() + offsets.back().
+    offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
+  }
+  // Now apply the offsets.
+  for (auto &c : encoder_) {
+    RecodedCharID *code = &c;
+    for (int i = 0; i < code->length(); ++i) {
+      int value = (*code)(i);
+      code->Set(i, value + offsets[value]);
+    }
+  }
+  // Re-derive code_range_ from the renumbered codes.
+  ComputeCodeRange();
+}
+
+// Encodes a single unichar_id. Returns the length of the code, or zero if
+// invalid input, and the encoding itself
+int UnicharCompress::EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const {
+  // Ids outside the encoder table are rejected, not clamped.
+  if (unichar_id >= encoder_.size()) {
+    return 0;
+  }
+  *code = encoder_[unichar_id];
+  return code->length();
+}
+
+// Decodes code, returning the original unichar-id, or
+// INVALID_UNICHAR_ID if the input is invalid.
+int UnicharCompress::DecodeUnichar(const RecodedCharID &code) const {
+  int len = code.length();
+  // Reject impossible lengths before consulting the hash map.
+  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) {
+    return INVALID_UNICHAR_ID;
+  }
+  // Only complete code sequences are present in decoder_; a prefix of a
+  // longer code is not found here.
+  auto it = decoder_.find(code);
+  if (it == decoder_.end()) {
+    return INVALID_UNICHAR_ID;
+  }
+  return it->second;
+}
+
+// Writes to the given file. Returns false in case of error.
+// Only encoder_ is stored; all decoding tables are rebuilt by DeSerialize.
+bool UnicharCompress::Serialize(TFile *fp) const {
+  return fp->Serialize(encoder_);
+}
+
+// Reads from the given file. Returns false in case of error.
+// Restores encoder_ and regenerates code_range_ plus the decoding tables,
+// mirroring what Serialize omitted.
+bool UnicharCompress::DeSerialize(TFile *fp) {
+  if (!fp->DeSerialize(encoder_)) {
+    return false;
+  }
+  ComputeCodeRange();
+  SetupDecoder();
+  return true;
+}
+
+// Returns a string containing a text file that describes the encoding thus:
+// <index>[,<index>]*<tab><UTF8-str><newline>
+// In words, a comma-separated list of one or more indices, followed by a tab
+// and the UTF-8 string that the code represents per line. Most simple scripts
+// will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
+// and the Indic scripts will contain a many-to-many mapping.
+// See the class comment above for details.
+std::string UnicharCompress::GetEncodingAsString(const UNICHARSET &unicharset) const {
+  std::string encoding;
+  for (unsigned c = 0; c < encoder_.size(); ++c) {
+    const RecodedCharID &code = encoder_[c];
+    // Consecutive special codes that share a code map to one line.
+    if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
+      // Don't show the duplicate entry.
+      continue;
+    }
+    // Emit the comma-separated code values.
+    encoding += std::to_string(code(0));
+    for (int i = 1; i < code.length(); ++i) {
+      encoding += "," + std::to_string(code(i));
+    }
+    encoding += "\t";
+    // Entries beyond the unicharset (and the special codes) have no real
+    // unichar text, so they are labeled with the <nul> string.
+    if (c >= unicharset.size() ||
+        (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && unicharset.has_special_codes())) {
+      encoding += kNullChar;
+    } else {
+      encoding += unicharset.id_to_unichar(c);
+    }
+    encoding += "\n";
+  }
+  return encoding;
+}
+
+// Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
+// Note that the returned values are 0-based indices, NOT unicode Jamo.
+// Returns false if the input is not in the Hangul unicode range.
+// The arithmetic inverts the standard Hangul syllable composition:
+// offset = (leading * kVCount + vowel) * kTCount + trailing.
+/* static */
+bool UnicharCompress::DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing) {
+  if (unicode < kFirstHangul) {
+    return false;
+  }
+  int offset = unicode - kFirstHangul;
+  if (offset >= kNumHangul) {
+    return false;
+  }
+  // Number of syllables per leading consonant.
+  const int kNCount = kVCount * kTCount;
+  *leading = offset / kNCount;
+  *vowel = (offset % kNCount) / kTCount;
+  *trailing = offset % kTCount;
+  return true;
+}
+
+// Computes the value of code_range_ from the encoder_.
+// code_range_ becomes one more than the largest code value in use
+// (0 when encoder_ is empty).
+void UnicharCompress::ComputeCodeRange() {
+  code_range_ = -1;
+  for (auto &code : encoder_) {
+    for (int i = 0; i < code.length(); ++i) {
+      if (code(i) > code_range_) {
+        code_range_ = code(i);
+      }
+    }
+  }
+  ++code_range_;
+}
+
+// Initializes the decoding hash_map from the encoding array.
+// Builds, from encoder_:
+//   decoder_        : complete code sequence -> unichar id;
+//   is_valid_start_ : flags the code values that may begin a sequence;
+//   final_codes_    : prefix (all but the last element) -> list of valid
+//                     final elements;
+//   next_codes_     : shorter prefix -> list of valid continuation elements.
+void UnicharCompress::SetupDecoder() {
+  Cleanup();
+  is_valid_start_.clear();
+  is_valid_start_.resize(code_range_);
+  for (unsigned c = 0; c < encoder_.size(); ++c) {
+    const RecodedCharID &code = encoder_[c];
+    decoder_[code] = c;
+    is_valid_start_[code(0)] = true;
+    RecodedCharID prefix = code;
+    int len = code.length() - 1;
+    prefix.Truncate(len);
+    auto final_it = final_codes_.find(prefix);
+    if (final_it == final_codes_.end()) {
+      auto *code_list = new std::vector<int>;
+      code_list->push_back(code(len));
+      final_codes_[prefix] = code_list;
+      // A new longest prefix: also register every shorter prefix of this
+      // code in next_codes_, stopping at the first one already present.
+      while (--len >= 0) {
+        prefix.Truncate(len);
+        auto next_it = next_codes_.find(prefix);
+        if (next_it == next_codes_.end()) {
+          auto *code_list = new std::vector<int>;
+          code_list->push_back(code(len));
+          next_codes_[prefix] = code_list;
+        } else {
+          // We still have to search the list as we may get here via multiple
+          // lengths of code.
+          if (!contains(*next_it->second, code(len))) {
+            next_it->second->push_back(code(len));
+          }
+          break; // This prefix has been processed.
+        }
+      }
+    } else {
+      // Known prefix: just add this final element if it is new.
+      if (!contains(*final_it->second, code(len))) {
+        final_it->second->push_back(code(len));
+      }
+    }
+  }
+}
+
+// Frees allocated memory.
+// next_codes_ and final_codes_ own their vector values through raw
+// pointers (see SetupDecoder), so each value must be deleted explicitly
+// before the maps are cleared.
+void UnicharCompress::Cleanup() {
+  decoder_.clear();
+  is_valid_start_.clear();
+  for (auto &next_code : next_codes_) {
+    delete next_code.second;
+  }
+  for (auto &final_code : final_codes_) {
+    delete final_code.second;
+  }
+  next_codes_.clear();
+  final_codes_.clear();
+}
+
+} // namespace tesseract.