# HG changeset patch # User Franz Glasner # Date 1641081610 -3600 # Node ID 84657447ab39b26e82be32e57a37593bc85c4c70 # Parent 325008573bc60ad1daa0d39501f15e4f544931a9 FIX: Properly raise a UnicodeEncodeError from C diff -r 325008573bc6 -r 84657447ab39 configmix/_speedups.c --- a/configmix/_speedups.c Sat Jan 01 21:38:46 2022 +0100 +++ b/configmix/_speedups.c Sun Jan 02 01:00:10 2022 +0100 @@ -62,6 +62,37 @@ #if defined(Py_LIMITED_API) +static +void +_raise_utf8_encode_error(PyObject *s, + Py_ssize_t start, Py_ssize_t end, + const char *reason) +{ + /* + * See also: https://docs.python.org/3/c-api/exceptions.html#unicode-exception-objects + */ + PyObject *errobj = PyObject_CallFunction( + PyExc_UnicodeEncodeError, + "sOnns", + "utf-8", + s, + start, + end, + reason); + + if (errobj == NULL) { + /* cannot do anything here */ + return; + } + /* Make PyExc_UnicodeEncodeError owned because PyErr_Restore steals */ + //Py_INCREF(PyExc_UnicodeEncodeError); + //PyErr_Restore(PyExc_UnicodeEncodeError, errobj, NULL); + + PyErr_SetObject(PyExc_UnicodeEncodeError, errobj); + Py_DECREF(errobj); +} + + /* * Copyright 2001-2004 Unicode, Inc. * @@ -105,6 +136,8 @@ Py_ssize_t _convert_ucs4_to_utf8( Py_UCS4 ch, + PyObject *ch_obj, /* for error messages: the string where ch comes from */ + Py_ssize_t ch_obj_end, /* effective length of ch_obj (error reporting) */ unsigned char *targetStart, unsigned char *targetEnd, int strict) { @@ -117,8 +150,10 @@ if (strict) { /* UTF-16 surrogate values are illegal */ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - PyErr_SetString(PyExc_UnicodeEncodeError, - "surrogate values not allowed"); + _raise_utf8_encode_error( + ch_obj, + 1, ch_obj_end, + "surrogate values are illegal in UCS4"); return -1; } } @@ -135,15 +170,19 @@ } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; } else { - PyErr_SetString(PyExc_UnicodeEncodeError, - "max Unicode codepoint value exceeded"); + _raise_utf8_encode_error( + ch_obj, + 1, ch_obj_end, + "max Unicode codepoint value exceeded"); return -1; } target += bytesToWrite; if (target > targetEnd) { - PyErr_SetString(PyExc_UnicodeEncodeError, - "target exhausted"); + _raise_utf8_encode_error( + ch_obj, + 1, ch_obj_end, + "temporary target buffer exhausted"); return -1; } switch (bytesToWrite) { /* note: everything falls through. */ @@ -156,6 +195,11 @@ } +/* + * End of Copyright 2001-2004 Unicode, Inc. + */ + + static PyObject * _hex2string(PyObject *s, Py_ssize_t end) @@ -169,7 +213,7 @@ return NULL; /* Replace the combination PyUniode_New/PyUnicode_WriteChar */ - buf_bytes = _convert_ucs4_to_utf8(c, buf, &(buf[6]), 1); + buf_bytes = _convert_ucs4_to_utf8(c, s, end+1, buf, &(buf[6]), 1); if (buf_bytes < 0) { return NULL; }