diff mupdf-source/thirdparty/tesseract/src/ccutil/unicharcompress.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccutil/unicharcompress.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,249 @@
+///////////////////////////////////////////////////////////////////////
+// File:        unicharcompress.h
+// Description: Unicode re-encoding using a sequence of smaller numbers in
+//              place of a single large code for CJK, similarly for Indic,
+//              and dissection of ligatures for other scripts.
+// Author:      Ray Smith
+//
+// (C) Copyright 2015, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
+#define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
+
+#include <unordered_map>
+#include <vector>
+#include "serialis.h"
+#include "unicharset.h"
+
+namespace tesseract {
+
+// Trivial class to hold the code for a recoded unichar-id.
+class RecodedCharID {
+public:
+  // The maximum length of a code.
+  static const int kMaxCodeLen = 9;
+
+  RecodedCharID() : self_normalized_(1), length_(0) {
+    memset(code_, 0, sizeof(code_));
+  }
+  void Truncate(int length) {
+    length_ = length;
+  }
+  // Sets the code value at the given index in the code.
+  void Set(int index, int value) {
+    code_[index] = value;
+    if (length_ <= index) {
+      length_ = index + 1;
+    }
+  }
+  // Shorthand for setting codes of length 3, as all Hangul and Han codes are
+  // length 3.
+  void Set3(int code0, int code1, int code2) {
+    length_ = 3;
+    code_[0] = code0;
+    code_[1] = code1;
+    code_[2] = code2;
+  }
+  bool empty() const {
+    return length_ == 0;
+  }
+  // Accessors
+  int length() const {
+    return length_;
+  }
+  int operator()(int index) const {
+    return code_[index];
+  }
+
+  // Writes to the given file. Returns false in case of error.
+  bool Serialize(TFile *fp) const {
+    return fp->Serialize(&self_normalized_) && fp->Serialize(&length_) &&
+           fp->Serialize(&code_[0], length_);
+  }
+  // Reads from the given file. Returns false in case of error.
+  bool DeSerialize(TFile *fp) {
+    return fp->DeSerialize(&self_normalized_) && fp->DeSerialize(&length_) &&
+           fp->DeSerialize(&code_[0], length_);
+  }
+  bool operator==(const RecodedCharID &other) const {
+    if (length_ != other.length_) {
+      return false;
+    }
+    for (int i = 0; i < length_; ++i) {
+      if (code_[i] != other.code_[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+  // Hash functor for RecodedCharID.
+  struct RecodedCharIDHash {
+    uint64_t operator()(const RecodedCharID &code) const {
+      uint64_t result = 0;
+      for (int i = 0; i < code.length_; ++i) {
+        result ^= static_cast<uint64_t>(code(i)) << (7 * i);
+      }
+      return result;
+    }
+  };
+
+private:
+  // True if this code is self-normalizing, ie is the master entry for indices
+  // that map to the same code. Has boolean value, but int8_t for serialization.
+  int8_t self_normalized_;
+  // The number of elements in use in code_;
+  int32_t length_;
+  // The re-encoded form of the unichar-id to which this RecodedCharID relates.
+  int32_t code_[kMaxCodeLen];
+};
+
+// Class holds a "compression" of a unicharset to simplify the learning problem
+// for a neural-network-based classifier.
+// Objectives:
+// 1 (CJK): Ids of a unicharset with a large number of classes are expressed as
+//          a sequence of 3 codes with much fewer values.
+//          This is achieved using the Jamo coding for Hangul and the Unicode
+//          Radical-Stroke-index for Han.
+// 2 (Indic): Instead of thousands of codes with one for each grapheme, re-code
+//            as the unicode sequence (but coded in a more compact space).
+// 3 (the rest): Eliminate multi-path problems with ligatures and fold confusing
+//               and not significantly distinct shapes (quotes) together, ie
+//               represent the fi ligature as the f-i pair, and fold u+2019 and
+//               friends all onto ascii single '
+// 4 The null character and mapping to target activations:
+//    To save horizontal coding space, the compressed codes are generally mapped
+//    to target network activations without intervening null characters, BUT
+//    in the case of ligatures, such as ff, null characters have to be included
+//    so existence of repeated codes is detected at codebook-building time, and
+//    null characters are embedded directly into the codes, so the rest of the
+//    system doesn't need to worry about the problem (much). There is still an
+//    effect on the range of ways in which the target activations can be
+//    generated.
+//
+// The computed code values are compact (no unused values), and, for CJK,
+// unique (each code position uses a disjoint set of values from each other code
+// position). For non-CJK, the same code value CAN be used in multiple
+// positions, eg the ff ligature is converted to <f> <nullchar> <f>, where <f>
+// is the same code as is used for the single f.
+class TESS_API UnicharCompress {
+public:
+  UnicharCompress();
+  UnicharCompress(const UnicharCompress &src);
+  ~UnicharCompress();
+  UnicharCompress &operator=(const UnicharCompress &src);
+
+  // The 1st Hangul unicode.
+  static const int kFirstHangul = 0xac00;
+  // The number of Hangul unicodes.
+  static const int kNumHangul = 11172;
+  // The number of Jamos for each of the 3 parts of a Hangul character, being
+  // the Leading consonant, Vowel and Trailing consonant.
+  static const int kLCount = 19;
+  static const int kVCount = 21;
+  static const int kTCount = 28;
+
+  // Computes the encoding for the given unicharset. It is a requirement that
+  // the file training/langdata/radical-stroke.txt have been read into the
+  // input string radical_stroke_table.
+  // Returns false if the encoding cannot be constructed.
+  bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table);
+  // Sets up an encoder that doesn't change the unichars at all, so it just
+  // passes them through unchanged.
+  void SetupPassThrough(const UNICHARSET &unicharset);
+  // Sets up an encoder directly using the given encoding vector, which maps
+  // unichar_ids to the given codes.
+  void SetupDirect(const std::vector<RecodedCharID> &codes);
+
+  // Returns the number of different values that can be used in a code, ie
+  // 1 + the maximum value that will ever be used by an RecodedCharID code in
+  // any position in its array.
+  int code_range() const {
+    return code_range_;
+  }
+
+  // Encodes a single unichar_id. Returns the length of the code, (or zero if
+  // invalid input), and the encoding itself in code.
+  int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const;
+  // Decodes code, returning the original unichar-id, or
+  // INVALID_UNICHAR_ID if the input is invalid.
+  int DecodeUnichar(const RecodedCharID &code) const;
+  // Returns true if the given code is a valid start or single code.
+  bool IsValidFirstCode(int code) const {
+    return is_valid_start_[code];
+  }
+  // Returns a list of valid non-final next codes for a given prefix code,
+  // which may be empty.
+  const std::vector<int> *GetNextCodes(const RecodedCharID &code) const {
+    auto it = next_codes_.find(code);
+    return it == next_codes_.end() ? nullptr : it->second;
+  }
+  // Returns a list of valid final codes for a given prefix code, which may
+  // be empty.
+  const std::vector<int> *GetFinalCodes(const RecodedCharID &code) const {
+    auto it = final_codes_.find(code);
+    return it == final_codes_.end() ? nullptr : it->second;
+  }
+
+  // Writes to the given file. Returns false in case of error.
+  bool Serialize(TFile *fp) const;
+  // Reads from the given file. Returns false in case of error.
+
+  bool DeSerialize(TFile *fp);
+
+  // Returns a string containing a text file that describes the encoding thus:
+  // <index>[,<index>]*<tab><UTF8-str><newline>
+  // In words, a comma-separated list of one or more indices, followed by a tab
+  // and the UTF-8 string that the code represents per line. Most simple scripts
+  // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
+  // and the Indic scripts will contain a many-to-many mapping.
+  // See the class comment above for details.
+  std::string GetEncodingAsString(const UNICHARSET &unicharset) const;
+
+  // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
+  // Note that the returned values are 0-based indices, NOT unicode Jamo.
+  // Returns false if the input is not in the Hangul unicode range.
+  static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing);
+
+private:
+  // Renumbers codes to eliminate unused values.
+  void DefragmentCodeValues(int encoded_null);
+  // Computes the value of code_range_ from the encoder_.
+  void ComputeCodeRange();
+  // Initializes the decoding hash_map from the encoder_ array.
+  void SetupDecoder();
+  // Frees allocated memory.
+  void Cleanup();
+
+  // The encoder that maps a unichar-id to a sequence of small codes.
+  // encoder_ is the only part that is serialized. The rest is computed on load.
+  std::vector<RecodedCharID> encoder_;
+  // Decoder converts the output of encoder back to a unichar-id.
+  std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash> decoder_;
+  // True if the index is a valid single or start code.
+  std::vector<bool> is_valid_start_;
+  // Maps a prefix code to a list of valid next codes.
+  // The map owns the vectors.
+  std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>
+      next_codes_;
+  // Maps a prefix code to a list of valid final codes.
+  // The map owns the vectors.
+  std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>
+      final_codes_;
+  // Max of any value in encoder_ + 1.
+  int code_range_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCUTIL_UNICHARCOMPRESS_H_