Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/thirdparty/tesseract/src/ccutil/unichar.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccutil/unichar.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,251 @@
+///////////////////////////////////////////////////////////////////////
+// File:        unichar.cpp
+// Description: Unicode character/ligature class.
+// Author:      Ray Smith
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/unichar.h>
+#include "errcode.h"
+#include "tprintf.h"
+
+// Largest value the UNICHAR(int) constructor accepts as a legal UTF-32 code
+// point; anything above it yields an empty UNICHAR.
+#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
+
+namespace tesseract {
+
+// Construct from a UTF-8 string of at most len bytes. If len < 0 the string
+// is NUL-terminated and its length is measured here, capped at UNICHAR_LEN.
+// Only whole, well-formed characters are kept: copying stops at the first
+// illegal lead byte, invalid continuation byte, sequence cut off by len, or
+// character that would not fit into UNICHAR_LEN bytes.
+// The resulting UNICHAR may be empty.
+UNICHAR::UNICHAR(const char *utf8_str, int len) {
+  if (len < 0) {
+    // Measure the NUL-terminated input, never looking past UNICHAR_LEN bytes.
+    for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {
+    }
+  }
+  int total_len = 0;
+  while (total_len < len) {
+    const int step = utf8_step(utf8_str + total_len);
+    if (step == 0) {
+      break; // Illegal first byte.
+    }
+    if (total_len + step > UNICHAR_LEN) {
+      break; // Too long to store.
+    }
+    if (total_len + step > len) {
+      // The lead byte announces more bytes than the caller supplied. Stop
+      // here so that nothing beyond len is read or copied.
+      break;
+    }
+    int i = 1;
+    while (i < step && (utf8_str[total_len + i] & 0xc0) == 0x80) {
+      ++i;
+    }
+    if (i < step) {
+      break; // Illegal continuation byte.
+    }
+    total_len += step;
+  }
+  memcpy(chars, utf8_str, total_len);
+  if (total_len < UNICHAR_LEN) {
+    // The last byte of chars holds the length; the bytes between the copied
+    // text and that length byte are zeroed.
+    chars[UNICHAR_LEN - 1] = total_len;
+    while (total_len < UNICHAR_LEN - 1) {
+      chars[total_len++] = 0;
+    }
+  }
+}
+
+// Construct from a single UCS-4 code point, stored as its UTF-8 encoding.
+// Illegal values (negative, or above UNI_MAX_LEGAL_UTF32) are ignored,
+// resulting in an empty UNICHAR.
+UNICHAR::UNICHAR(int unicode) {
+  const int bytemask = 0xBF; // Clears bit 6 of a continuation byte.
+  const int bytemark = 0x80; // Sets bit 7 of a continuation byte.
+
+  // Start from all zeros: illegal input then leaves an empty UNICHAR, and no
+  // byte between the encoded character and the length byte stays
+  // uninitialized.
+  memset(chars, 0, UNICHAR_LEN);
+  if (unicode < 0 || unicode > UNI_MAX_LEGAL_UTF32) {
+    // Without the lower bound a negative value would take the single-byte
+    // branch below and be stored as if it were a one-byte character.
+    return;
+  }
+
+  // chars[UNICHAR_LEN - 1] holds the encoded length. The low six bits of the
+  // value go into the last byte, then each shift exposes the next six.
+  if (unicode < 0x80) {
+    chars[UNICHAR_LEN - 1] = 1;
+    chars[0] = static_cast<char>(unicode);
+  } else if (unicode < 0x800) {
+    chars[UNICHAR_LEN - 1] = 2;
+    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
+    unicode >>= 6;
+    chars[0] = static_cast<char>(unicode | 0xc0);
+  } else if (unicode < 0x10000) {
+    chars[UNICHAR_LEN - 1] = 3;
+    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
+    unicode >>= 6;
+    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
+    unicode >>= 6;
+    chars[0] = static_cast<char>(unicode | 0xe0);
+  } else {
+    chars[UNICHAR_LEN - 1] = 4;
+    chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
+    unicode >>= 6;
+    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
+    unicode >>= 6;
+    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
+    unicode >>= 6;
+    chars[0] = static_cast<char>(unicode | 0xf0);
+  }
+}
+
+// Get the first character as UCS-4.
+// The number of bytes to decode is taken from utf8_step() on the first
+// stored byte; if that byte is not a legal lead byte the result is 0.
+int UNICHAR::first_uni() const {
+  // The raw bytes are accumulated with their UTF-8 marker bits still in
+  // place; subtracting the entry for the sequence length removes all of them
+  // in one go. The table is indexed by sequence length in bytes.
+  static const int marker_bits[5] = {
+      0,         // 0: illegal lead byte, nothing gets accumulated.
+      0,         // 1: 0xxxxxxx carries no marker bits.
+      0x3080,    // 2: (0xC0 << 6) + 0x80.
+      0xE2080,   // 3: (0xE0 << 12) + (0x80 << 6) + 0x80.
+      0x3C82080  // 4: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80.
+  };
+  const int num_bytes = utf8_step(chars);
+  int code = 0;
+  for (int i = 0; i < num_bytes; ++i) {
+    // Make room for six more bits, then add the next byte as it is.
+    code = (code << 6) + static_cast<unsigned char>(chars[i]);
+  }
+  return code - marker_bits[num_bytes];
+}
+
+// Returns a newly allocated, NUL-terminated copy of the UTF-8 bytes stored
+// in this UNICHAR. The caller owns the array and must delete[] it after use.
+char *UNICHAR::utf8_str() const {
+  const int num_bytes = utf8_len();
+  char *copy = new char[num_bytes + 1];
+  copy[num_bytes] = '\0';
+  memcpy(copy, chars, num_bytes);
+  return copy;
+}
+
+// Get the number of bytes in the first character of the given utf8 string,
+// judged from its lead byte alone. Returns 0 when that byte cannot start a
+// character: a continuation byte (0x80-0xBF) or a byte in 0xF8-0xFF.
+// Continuation bytes are not inspected here.
+int UNICHAR::utf8_step(const char *utf8_str) {
+  const unsigned char lead = static_cast<unsigned char>(*utf8_str);
+  if (lead < 0x80) {
+    return 1; // 0xxxxxxx: single byte, the NUL terminator included.
+  }
+  if (lead < 0xC0) {
+    return 0; // 10xxxxxx: continuation byte, not a valid start.
+  }
+  if (lead < 0xE0) {
+    return 2; // 110xxxxx (0xC0 and 0xC1 are counted here as well).
+  }
+  if (lead < 0xF0) {
+    return 3; // 1110xxxx
+  }
+  if (lead < 0xF8) {
+    return 4; // 11110xxx
+  }
+  return 0; // 11111xxx: never a lead byte.
+}
+
+// Pre-increment: advance to the start of the next UTF-8 character.
+// If the current byte cannot start a character, an error is logged together
+// with up to five bytes from the current position (stopping at a NUL), and
+// the iterator moves forward by a single byte so that iteration continues.
+UNICHAR::const_iterator &UNICHAR::const_iterator::operator++() {
+  ASSERT_HOST(it_ != nullptr);
+  int step = utf8_step(it_);
+  if (step == 0) {
+    tprintf("ERROR: Illegal UTF8 encountered.\n");
+    for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
+      // %x expects an unsigned value. Going through unsigned char keeps a
+      // byte >= 0x80 from being sign-extended (0xffffff80 instead of 0x80)
+      // where plain char is signed.
+      const unsigned byte = static_cast<unsigned char>(it_[i]);
+      tprintf("Index %d char = 0x%x\n", i, byte);
+    }
+    step = 1;
+  }
+  it_ += step;
+  return *this;
+}
+
+// Dereference: decode the character at the current position to UCS-4.
+// A byte that cannot start a UTF-8 character is reported through tprintf and
+// read as a space.
+int UNICHAR::const_iterator::operator*() const {
+  ASSERT_HOST(it_ != nullptr);
+  const int num_bytes = utf8_step(it_);
+  if (num_bytes != 0) {
+    // Copy exactly one character into a temporary UNICHAR and decode that.
+    return UNICHAR(it_, num_bytes).first_uni();
+  }
+  tprintf("WARNING: Illegal UTF8 encountered\n");
+  return ' ';
+}
+
+// Writes the bytes of the current character to utf8_output and returns how
+// many were written (1 to 4). No terminating NUL is appended after them.
+// A byte that cannot start a UTF-8 character is reported through tprintf and
+// written as a single space.
+int UNICHAR::const_iterator::get_utf8(char *utf8_output) const {
+  ASSERT_HOST(it_ != nullptr);
+  const int num_bytes = utf8_step(it_);
+  if (num_bytes != 0) {
+    strncpy(utf8_output, it_, num_bytes);
+    return num_bytes;
+  }
+  tprintf("WARNING: Illegal UTF8 encountered\n");
+  utf8_output[0] = ' ';
+  return 1;
+}
+
+// Returns the number of bytes of the character at the current position, as
+// announced by its lead byte. A byte that cannot start a UTF-8 character is
+// reported through tprintf and counted as one byte.
+int UNICHAR::const_iterator::utf8_len() const {
+  ASSERT_HOST(it_ != nullptr);
+  const int num_bytes = utf8_step(it_);
+  if (num_bytes != 0) {
+    return num_bytes;
+  }
+  tprintf("WARNING: Illegal UTF8 encountered\n");
+  return 1;
+}
+
+// Returns true if the byte at the current position can start a UTF-8
+// character, i.e. utf8_step() reports a non-zero length for it. Only that
+// one byte is examined.
+bool UNICHAR::const_iterator::is_legal() const {
+  return utf8_step(it_) != 0;
+}
+
+// Returns an iterator positioned on the first byte of utf8_str.
+// len is not used here.
+UNICHAR::const_iterator UNICHAR::begin(const char *utf8_str, int len) {
+  return const_iterator(utf8_str);
+}
+
+// Returns the past-the-end iterator for a string of len bytes: it is
+// positioned len bytes beyond utf8_str.
+UNICHAR::const_iterator UNICHAR::end(const char *utf8_str, int len) {
+  return const_iterator(utf8_str + len);
+}
+
+// Converts a NUL-terminated utf-8 string to a vector of unicodes.
+// Returns an empty vector if the input contains invalid UTF-8: an illegal
+// lead byte, a byte that is not a continuation byte where one is required,
+// or a sequence that is cut short by the end of the string.
+/* static */
+std::vector<char32> UNICHAR::UTF8ToUTF32(const char *utf8_str) {
+  const int utf8_length = static_cast<int>(strlen(utf8_str));
+  std::vector<char32> unicodes;
+  unicodes.reserve(utf8_length);
+  int pos = 0;
+  while (pos < utf8_length) {
+    const char *current = utf8_str + pos;
+    const int step = utf8_step(current);
+    // A lead byte that announces more bytes than are left must be rejected:
+    // stepping over it would carry the scan past the terminator and the
+    // loop would never see the end of the string.
+    bool valid = step > 0 && step <= utf8_length - pos;
+    for (int i = 1; valid && i < step; ++i) {
+      valid = (current[i] & 0xc0) == 0x80;
+    }
+    if (!valid) {
+      unicodes.clear();
+      return unicodes;
+    }
+    // Decode exactly one character, the same way const_iterator::operator*
+    // does.
+    unicodes.push_back(UNICHAR(current, step).first_uni());
+    pos += step;
+  }
+  return unicodes;
+}
+
+// Encodes a sequence of unicodes as one UTF-8 string, character by character.
+// Returns an empty string if the input contains an invalid unicode, that is
+// a value whose UNICHAR is empty or does not start with a legal lead byte.
+std::string UNICHAR::UTF32ToUTF8(const std::vector<char32> &str32) {
+  std::string result;
+  for (char32 code_point : str32) {
+    UNICHAR encoded(code_point);
+    if (encoded.utf8_len() <= 0) {
+      return "";
+    }
+    const int num_bytes = utf8_step(encoded.utf8());
+    if (num_bytes <= 0) {
+      return "";
+    }
+    result.append(encoded.utf8(), num_bytes);
+  }
+  return result;
+}
+
+} // namespace tesseract
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children