changeset 549:84657447ab39

FIX: Properly raise a UnicodeEncodeError from C
author Franz Glasner <fzglas.hg@dom66.de>
date Sun, 02 Jan 2022 01:00:10 +0100
parents 325008573bc6
children 79db28e879f8
files configmix/_speedups.c
diffstat 1 files changed, 51 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/configmix/_speedups.c	Sat Jan 01 21:38:46 2022 +0100
+++ b/configmix/_speedups.c	Sun Jan 02 01:00:10 2022 +0100
@@ -62,6 +62,37 @@
 
 #if defined(Py_LIMITED_API)
 
+static
+void
+_raise_utf8_encode_error(PyObject *s,
+                         Py_ssize_t start, Py_ssize_t end,
+                         const char *reason)
+{
+    /*
+     * Set a UnicodeEncodeError for the "utf-8" codec as the current
+     * Python exception.
+     *
+     * The exception instance is created by calling
+     * PyExc_UnicodeEncodeError with the arguments
+     * (encoding, object, start, end, reason):
+     *
+     *   s       object handed over as the exception's "object" argument
+     *   start   value handed over as the "start" argument
+     *   end     value handed over as the "end" argument
+     *   reason  C string handed over as the "reason" argument
+     *
+     * The function itself returns nothing; the callers report the
+     * failure through their own return value.
+     *
+     * See also: https://docs.python.org/3/c-api/exceptions.html#unicode-exception-objects
+     */
+    PyObject *errobj = PyObject_CallFunction(
+        PyExc_UnicodeEncodeError,
+        "sOnns",
+        "utf-8",
+        s,
+        start,
+        end,
+        reason);
+
+    if (errobj == NULL) {
+        /* creating the exception instance failed: cannot do anything here */
+        return;
+    }
+    /* Disabled alternative: PyErr_Restore steals, so the type needs an INCREF */
+    //Py_INCREF(PyExc_UnicodeEncodeError);
+    //PyErr_Restore(PyExc_UnicodeEncodeError, errobj, NULL);
+
+    PyErr_SetObject(PyExc_UnicodeEncodeError, errobj);
+    Py_DECREF(errobj);  /* release the reference from PyObject_CallFunction */
+}
+
+
 /*
  * Copyright 2001-2004 Unicode, Inc.
  *
@@ -105,6 +136,8 @@
 Py_ssize_t
 _convert_ucs4_to_utf8(
     Py_UCS4 ch,
+    PyObject *ch_obj,  /* for error messages: the string where ch comes from */
+    Py_ssize_t ch_obj_end,  /* effective length of ch_obj (error reporting) */
     unsigned char *targetStart, unsigned char *targetEnd,
     int strict)
 {
@@ -117,8 +150,10 @@
     if (strict) {
         /* UTF-16 surrogate values are illegal */
         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
-            PyErr_SetString(PyExc_UnicodeEncodeError,
-                            "surrogate values not allowed");
+            _raise_utf8_encode_error(
+                ch_obj,
+                1, ch_obj_end,
+                "surrogate values are illegal in UCS4");
             return -1;
         }
     }
@@ -135,15 +170,19 @@
     } else if (ch <= UNI_MAX_LEGAL_UTF32) {
         bytesToWrite = 4;
     } else {
-        PyErr_SetString(PyExc_UnicodeEncodeError,
-                        "max Unicode codepoint value exceeded");
+        _raise_utf8_encode_error(
+            ch_obj,
+            1, ch_obj_end,
+            "max Unicode codepoint value exceeded");
         return -1;
     }
 
     target += bytesToWrite;
     if (target > targetEnd) {
-        PyErr_SetString(PyExc_UnicodeEncodeError,
-                        "target exhausted");
+        _raise_utf8_encode_error(
+            ch_obj,
+            1, ch_obj_end,
+            "temporary target buffer exhausted");
         return -1;
     }
     switch (bytesToWrite) { /* note: everything falls through. */
@@ -156,6 +195,11 @@
 }
 
 
+/*
+ * End of Copyright 2001-2004 Unicode, Inc.
+ */
+
+
 static
 PyObject *
 _hex2string(PyObject *s, Py_ssize_t end)
@@ -169,7 +213,7 @@
         return NULL;
 
     /* Replace the combination PyUnicode_New/PyUnicode_WriteChar */
-    buf_bytes = _convert_ucs4_to_utf8(c, buf, &(buf[6]), 1);
+    buf_bytes = _convert_ucs4_to_utf8(c, s, end+1, buf, &(buf[6]), 1);
     if (buf_bytes < 0) {
         return NULL;
     }