changeset 545:6501fe0e116c

Build the speedup C-extension against the stable API. This is done by emulating PyUnicode_New()/PyUnicode_WriteChar(): the UCS4 character is encoded to UTF-8 and passed to PyUnicode_FromStringAndSize().
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 01 Jan 2022 20:36:46 +0100
parents db2d108e14e7
children adf65c31f8fc
files configmix/_speedups.c setup.py
diffstat 2 files changed, 125 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/configmix/_speedups.c	Sat Jan 01 18:05:32 2022 +0100
+++ b/configmix/_speedups.c	Sat Jan 01 20:36:46 2022 +0100
@@ -60,6 +60,128 @@
 }
 
 
+#if defined(Py_LIMITED_API)
+
+/*
+ * Copyright 2001-2004 Unicode, Inc.
+ *
+ * Disclaimer
+ *
+ * This source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ *
+ * Limitations on Rights to Redistribute This Code
+ *
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
+
+#define UNI_MAX_LEGAL_UTF32 (Py_UCS4)0x0010FFFF  /* largest value the encoder accepts */
+#define UNI_SUR_HIGH_START  (Py_UCS4)0xD800  /* first surrogate value; lower bound of the strict-mode check */
+#define UNI_SUR_HIGH_END    (Py_UCS4)0xDBFF  /* last high (leading) surrogate */
+#define UNI_SUR_LOW_START   (Py_UCS4)0xDC00  /* first low (trailing) surrogate */
+#define UNI_SUR_LOW_END     (Py_UCS4)0xDFFF  /* last surrogate value; upper bound of the strict-mode check */
+
+
+/*
+ * Lead-byte marker OR-ed into the first byte of a UTF-8 sequence, indexed
+ * by the total length of the sequence in bytes (index 0 is unused).
+ * Entries 5 and 6 belong to the historical 5- and 6-byte forms; *legal*
+ * UTF-8 sequences are 4 or fewer bytes total, and the encoder below only
+ * ever indexes this table with 1..4.
+ */
+static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+
+
+static
+Py_ssize_t
+_convert_ucs4_to_utf8(
+    Py_UCS4 ch,  /* value to encode as one UTF-8 sequence */
+    unsigned char *targetStart, unsigned char *targetEnd,  /* output range [targetStart, targetEnd) */
+    int strict)  /* non-zero: reject surrogate values */
+{  /* returns the number of bytes written (1..4), or -1 with a Python exception set */
+    const Py_UCS4 byteMask = 0xBF;  /* clears bit 6 of a continuation byte */
+    const Py_UCS4 byteMark = 0x80;  /* sets bit 7 of a continuation byte */
+
+    Py_ssize_t bytesToWrite = 0;
+    unsigned char *target = targetStart;
+
+    if (strict) {
+        /* UTF-16 surrogate values are illegal.  NOTE(review): verify UnicodeEncodeError can be raised from a bare message. */
+        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+            PyErr_SetString(PyExc_UnicodeEncodeError,
+                            "surrogate values not allowed");
+            return -1;
+        }
+    }
+    /*
+     * Figure out how many bytes the result will require. Values above
+     * UNI_MAX_LEGAL_UTF32 are rejected with an exception (no replacement).
+     */
+    if (ch < (Py_UCS4)0x80) {
+        bytesToWrite = 1;
+    } else if (ch < (Py_UCS4)0x800) {
+        bytesToWrite = 2;
+    } else if (ch < (Py_UCS4)0x10000) {
+        bytesToWrite = 3;
+    } else if (ch <= UNI_MAX_LEGAL_UTF32) {
+        bytesToWrite = 4;
+    } else {
+        PyErr_SetString(PyExc_UnicodeEncodeError,
+                        "max Unicode codepoint value exceeded");
+        return -1;
+    }
+
+    target += bytesToWrite;  /* advanced before the bounds check; bytes are then stored back to front */
+    if (target > targetEnd) {
+        PyErr_SetString(PyExc_UnicodeEncodeError,
+                        "target exhausted");
+        return -1;
+    }
+    switch (bytesToWrite) { /* note: everything falls through. */
+    case 4: *--target = (unsigned char)((ch | byteMark) & byteMask); ch >>= 6;  /* fallthrough */
+    case 3: *--target = (unsigned char)((ch | byteMark) & byteMask); ch >>= 6;  /* fallthrough */
+    case 2: *--target = (unsigned char)((ch | byteMark) & byteMask); ch >>= 6;  /* fallthrough */
+    case 1: *--target = (unsigned char) (ch | firstByteMark[bytesToWrite]);  /* lead byte: remaining bits plus length marker */
+    }
+    return bytesToWrite;
+}
+
+
+static
+PyObject *
+_hex2string(PyObject *s, Py_ssize_t end)
+{  /* Py_LIMITED_API variant: returns a str built from the value _hex2ucs4(s, end, ...) yields, or NULL on failure */
+    Py_UCS4 c;
+    unsigned char buf[6];  /* UTF-8 scratch buffer; the encoder writes at most 4 bytes */
+    Py_ssize_t buf_bytes;
+    PyObject *u;
+
+    if (_hex2ucs4(s, end, &c) != 0)
+        return NULL;
+
+    /* Replaces the PyUnicode_New()/PyUnicode_WriteChar() combination: encode strictly to UTF-8, then decode */
+    buf_bytes = _convert_ucs4_to_utf8(c, buf, &(buf[6]), 1);
+    if (buf_bytes < 0) {
+        return NULL;
+    }
+    u = PyUnicode_FromStringAndSize((const char *)buf, buf_bytes);
+    if (u == NULL) {
+        return NULL;
+    }
+    return u;
+}
+
+#else
+
 static
 PyObject *
 _hex2string(PyObject *s, Py_ssize_t end)
@@ -79,6 +201,8 @@
     return u;
 }
 
+#endif /* Py_LIMITED_API */
+
 
 static
 PyObject *
--- a/setup.py	Sat Jan 01 18:05:32 2022 +0100
+++ b/setup.py	Sat Jan 01 20:36:46 2022 +0100
@@ -57,7 +57,7 @@
     and (sys.version_info[0] > 3
          or (sys.version_info[0] == 3 and sys.version_info[1] >= 7))):
 
-    py_limited_api = False
+    py_limited_api = True
 
     if py_limited_api:
         define_macros = [("Py_LIMITED_API", "0x03070000")]