comparison configmix/_speedups.c @ 549:84657447ab39

FIX: Properly raise a UnicodeEncodeError from C
author Franz Glasner <fzglas.hg@dom66.de>
date Sun, 02 Jan 2022 01:00:10 +0100
parents 1cbe8b0f2b78
children 79db28e879f8
comparison
equal deleted inserted replaced
548:325008573bc6 549:84657447ab39
59 return 0; /* success */ 59 return 0; /* success */
60 } 60 }
61 61
62 62
63 #if defined(Py_LIMITED_API) 63 #if defined(Py_LIMITED_API)
64
65 static
66 void
67 _raise_utf8_encode_error(PyObject *s,
68 Py_ssize_t start, Py_ssize_t end,
69 const char *reason)
70 {
71 /*
72 * See also: https://docs.python.org/3/c-api/exceptions.html#unicode-exception-objects
73 */
74 PyObject *errobj = PyObject_CallFunction(
75 PyExc_UnicodeEncodeError,
76 "sOnns",
77 "utf-8",
78 s,
79 start,
80 end,
81 reason);
82
83 if (errobj == NULL) {
84 /* cannot do anything here */
85 return;
86 }
87 /* Make PyExc_UnicodeEncodeError owned because PyErr_Restore steals */
88 //Py_INCREF(PyExc_UnicodeEncodeError);
89 //PyErr_Restore(PyExc_UnicodeEncodeError, errobj, NULL);
90
91 PyErr_SetObject(PyExc_UnicodeEncodeError, errobj);
92 Py_DECREF(errobj);
93 }
94
64 95
65 /* 96 /*
66 * Copyright 2001-2004 Unicode, Inc. 97 * Copyright 2001-2004 Unicode, Inc.
67 * 98 *
68 * Disclaimer 99 * Disclaimer
103 134
104 static 135 static
105 Py_ssize_t 136 Py_ssize_t
106 _convert_ucs4_to_utf8( 137 _convert_ucs4_to_utf8(
107 Py_UCS4 ch, 138 Py_UCS4 ch,
139 PyObject *ch_obj, /* for error messages: the string where ch comes from */
140 Py_ssize_t ch_obj_end, /* effective length of ch_obj (error reporting) */
108 unsigned char *targetStart, unsigned char *targetEnd, 141 unsigned char *targetStart, unsigned char *targetEnd,
109 int strict) 142 int strict)
110 { 143 {
111 const Py_UCS4 byteMask = 0xBF; 144 const Py_UCS4 byteMask = 0xBF;
112 const Py_UCS4 byteMark = 0x80; 145 const Py_UCS4 byteMark = 0x80;
115 unsigned char *target = targetStart; 148 unsigned char *target = targetStart;
116 149
117 if (strict) { 150 if (strict) {
118 /* UTF-16 surrogate values are illegal */ 151 /* UTF-16 surrogate values are illegal */
119 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 152 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
120 PyErr_SetString(PyExc_UnicodeEncodeError, 153 _raise_utf8_encode_error(
121 "surrogate values not allowed"); 154 ch_obj,
155 1, ch_obj_end,
156 "surrogate values are illegal in UCS4");
122 return -1; 157 return -1;
123 } 158 }
124 } 159 }
125 /* 160 /*
126 * Figure out how many bytes the result will require. Turn any 161 * Figure out how many bytes the result will require. Turn any
133 } else if (ch < (Py_UCS4)0x10000) { 168 } else if (ch < (Py_UCS4)0x10000) {
134 bytesToWrite = 3; 169 bytesToWrite = 3;
135 } else if (ch <= UNI_MAX_LEGAL_UTF32) { 170 } else if (ch <= UNI_MAX_LEGAL_UTF32) {
136 bytesToWrite = 4; 171 bytesToWrite = 4;
137 } else { 172 } else {
138 PyErr_SetString(PyExc_UnicodeEncodeError, 173 _raise_utf8_encode_error(
139 "max Unicode codepoint value exceeded"); 174 ch_obj,
175 1, ch_obj_end,
176 "max Unicode codepoint value exceeded");
140 return -1; 177 return -1;
141 } 178 }
142 179
143 target += bytesToWrite; 180 target += bytesToWrite;
144 if (target > targetEnd) { 181 if (target > targetEnd) {
145 PyErr_SetString(PyExc_UnicodeEncodeError, 182 _raise_utf8_encode_error(
146 "target exhausted"); 183 ch_obj,
184 1, ch_obj_end,
185 "temporary target buffer exhausted");
147 return -1; 186 return -1;
148 } 187 }
149 switch (bytesToWrite) { /* note: everything falls through. */ 188 switch (bytesToWrite) { /* note: everything falls through. */
150 case 4: *--target = (unsigned char)((ch | byteMark) & byteMask); ch >>= 6; 189 case 4: *--target = (unsigned char)((ch | byteMark) & byteMask); ch >>= 6;
151 case 3: *--target = (unsigned char)((ch | byteMark) & byteMask); ch >>= 6; 190 case 3: *--target = (unsigned char)((ch | byteMark) & byteMask); ch >>= 6;
154 } 193 }
155 return bytesToWrite; 194 return bytesToWrite;
156 } 195 }
157 196
158 197
198 /*
199 * End of Copyright 2001-2004 Unicode, Inc.
200 */
201
202
159 static 203 static
160 PyObject * 204 PyObject *
161 _hex2string(PyObject *s, Py_ssize_t end) 205 _hex2string(PyObject *s, Py_ssize_t end)
162 { 206 {
163 Py_UCS4 c; 207 Py_UCS4 c;
167 211
168 if (_hex2ucs4(s, end, &c) != 0) 212 if (_hex2ucs4(s, end, &c) != 0)
169 return NULL; 213 return NULL;
170 214
171 /* Replace the combination PyUnicode_New/PyUnicode_WriteChar */ 215 /* Replace the combination PyUnicode_New/PyUnicode_WriteChar */
172 buf_bytes = _convert_ucs4_to_utf8(c, buf, &(buf[6]), 1); 216 buf_bytes = _convert_ucs4_to_utf8(c, s, end+1, buf, &(buf[6]), 1);
173 if (buf_bytes < 0) { 217 if (buf_bytes < 0) {
174 return NULL; 218 return NULL;
175 } 219 }
176 u = PyUnicode_FromStringAndSize((const char *)buf, buf_bytes); 220 u = PyUnicode_FromStringAndSize((const char *)buf, buf_bytes);
177 if (u == NULL) { 221 if (u == NULL) {