From eb50cd37eac47dd4dc71ab42d0582dfb6eac4515 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 11 Oct 2023 16:41:58 +0300 Subject: [PATCH] gh-110289: C API: Add PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize() functions (GH-110297) --- Doc/c-api/unicode.rst | 22 ++++ Doc/data/stable_abi.dat | 2 + Doc/whatsnew/3.13.rst | 6 + Include/unicodeobject.h | 9 ++ Lib/test/test_capi/test_unicode.py | 112 ++++++++++++++++++ Lib/test/test_stable_abi_ctypes.py | 2 + ...-10-03-19-01-20.gh-issue-110289.YBIHEz.rst | 1 + Misc/stable_abi.toml | 4 + Modules/_testcapi/unicode.c | 44 +++++++ Objects/unicodeobject.c | 76 ++++++++++++ PC/python3dll.c | 2 + 11 files changed, 280 insertions(+) create mode 100644 Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 2a2cb1b8c458e7..5ab9f1cab23ef8 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1396,6 +1396,28 @@ They all return ``NULL`` or ``-1`` if an exception occurs. :c:func:`PyErr_Occurred` to check for errors. +.. c:function:: int PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *string, Py_ssize_t size) + + Compare a Unicode object with a char buffer which is interpreted as + being UTF-8 or ASCII encoded and return true (``1``) if they are equal, + or false (``0``) otherwise. + If the Unicode object contains surrogate characters or + the C string is not valid UTF-8, false (``0``) is returned. + + This function does not raise exceptions. + + .. versionadded:: 3.13 + + +.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) + + Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute *string* + length using :c:func:`!strlen`. + If the Unicode object contains null characters, false (``0``) is returned. + + .. versionadded:: 3.13 + + .. 
c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string) Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 5bccd5edf586f4..6ec9c907254b04 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -755,6 +755,8 @@ function,PyUnicode_DecodeUnicodeEscape,3.2,, function,PyUnicode_EncodeCodePage,3.7,on Windows, function,PyUnicode_EncodeFSDefault,3.2,, function,PyUnicode_EncodeLocale,3.7,, +function,PyUnicode_EqualToUTF8,3.13,, +function,PyUnicode_EqualToUTF8AndSize,3.13,, function,PyUnicode_FSConverter,3.2,, function,PyUnicode_FSDecoder,3.2,, function,PyUnicode_Find,3.2,, diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 8b67c2737cde5d..bbc1fecf4964d8 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1024,6 +1024,12 @@ New Features functions on Python 3.11 and 3.12. (Contributed by Victor Stinner in :gh:`107073`.) +* Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` + functions: compare Unicode object with a :c:expr:`const char*` UTF-8 encoded + string and return true (``1``) if they are equal, or false (``0``) otherwise. + These functions do not raise exceptions. + (Contributed by Serhiy Storchaka in :gh:`110289`.) + * Add :c:func:`PyThreadState_GetUnchecked()` function: similar to :c:func:`PyThreadState_Get()`, but don't kill the process with a fatal error if it is NULL. The caller is responsible to check if the result is NULL. diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index f00277787122aa..dee00715b3c51d 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -957,6 +957,15 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( const char *right /* ASCII-encoded string */ ); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 +/* Compare a Unicode object with UTF-8 encoded C string. 
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_equaltoutf8(self):
    """Exercise PyUnicode_EqualToUTF8() through the _testcapi wrapper.

    The wrapper passes the bytes argument as a C string, so the comparison
    stops at the first null byte of the bytes argument.
    """
    from _testcapi import unicode_equaltoutf8 as equaltoutf8
    from _testcapi import unicode_asutf8andsize as asutf8andsize

    def check(text, data, expected):
        self.assertEqual(equaltoutf8(text, data), expected)

    # One sample per UTF-8 sequence length (1 to 4 bytes), plus the
    # highest code point.
    samples = (
        'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
        '\U0001f600\U0001f601\U0001f602',
        '\U0010ffff',
    )
    for cached in samples:
        # PyUnicode_AsUTF8AndSize() stores the UTF-8 encoded form in the
        # Unicode object, so `cached` takes the "UTF-8 cache" path.
        asutf8andsize(cached, 0)
        encoded = cached.encode()
        check(cached, encoded, 1)
        # Decoding yields a new Unicode object without the UTF-8 cache.
        fresh = encoded.decode()
        check(fresh, encoded, 1)
        check(cached + 'x', encoded + b'x', 1)
        check(cached + 'x', encoded + b'y', 0)
        # A trailing null byte terminates the C string and is ignored ...
        check(cached, encoded + b'\0', 1)
        check(fresh, encoded + b'\0', 1)
        # ... but a null character in the Unicode object never matches.
        check(cached + '\0', encoded + b'\0', 0)
        check(cached + '\0', encoded, 0)
        check(fresh, encoded + b'x', 0)
        check(fresh, encoded[:-1], 0)
        check(fresh, encoded[:-1] + b'x', 0)

    check('', b'', 1)
    check('', b'\0', 1)

    # Embedded null characters/bytes.
    check('abc', b'abc\0def\0', 1)
    check('a\0bc', b'abc', 0)
    check('abc', b'a\0bc', 0)

    # Surrogate characters always compare as not equal.
    for text, errors in (
        ('\udcfe', "surrogateescape"),
        ('\udcfe', "surrogatepass"),
        ('\ud801', "surrogatepass"),
    ):
        check(text, text.encode("utf8", errors), 0)

@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_equaltoutf8andsize(self):
    """Exercise PyUnicode_EqualToUTF8AndSize() through the _testcapi wrapper.

    When no explicit size is given, the wrapper uses the full length of
    the bytes argument, so null bytes take part in the comparison.
    """
    from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize
    from _testcapi import unicode_asutf8andsize as asutf8andsize

    def check(text, data, expected, *size):
        self.assertEqual(equaltoutf8andsize(text, data, *size), expected)

    # One sample per UTF-8 sequence length (1 to 4 bytes), plus the
    # highest code point.
    samples = (
        'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
        '\U0001f600\U0001f601\U0001f602',
        '\U0010ffff',
    )
    for cached in samples:
        # PyUnicode_AsUTF8AndSize() stores the UTF-8 encoded form in the
        # Unicode object, so `cached` takes the "UTF-8 cache" path.
        asutf8andsize(cached, 0)
        encoded = cached.encode()
        check(cached, encoded, 1)
        # Decoding yields a new Unicode object without the UTF-8 cache.
        fresh = encoded.decode()
        check(fresh, encoded, 1)
        check(cached + 'x', encoded + b'x', 1)
        check(cached + 'x', encoded + b'y', 0)
        # With an explicit length, a trailing null byte is significant.
        check(cached, encoded + b'\0', 0)
        check(fresh, encoded + b'\0', 0)
        check(cached + '\0', encoded + b'\0', 1)
        check(cached + '\0', encoded, 0)
        check(fresh, encoded + b'x', 0)
        check(fresh, encoded[:-1], 0)
        check(fresh, encoded[:-1] + b'x', 0)
        # Buffer that is not null-terminated at `size`.
        nbytes = len(encoded)
        check(cached, encoded + b'x', 1, nbytes)
        check(fresh, encoded + b'x', 1, nbytes)
        check(cached + '\0', encoded + b'\0x', 1, nbytes + 1)
        check(fresh, encoded, 0, nbytes - 1)

    check('', b'', 1)
    check('', b'\0', 0)
    check('', b'x', 1, 0)

    # Embedded null characters/bytes.
    check('abc\0def', b'abc\0def', 1)
    check('abc\0def\0', b'abc\0def\0', 1)

    # Surrogate characters always compare as not equal.
    for text, errors in (
        ('\udcfe', "surrogateescape"),
        ('\udcfe', "surrogatepass"),
        ('\ud801', "surrogatepass"),
    ):
        check(text, text.encode("utf8", errors), 0)

    def check_not_equal_encoding(text, encoding):
        other = text.encode(encoding)
        check(text, other, 0)
        # Guard against an encoding that happens to coincide with UTF-8.
        self.assertNotEqual(text.encode(encoding), text.encode("utf8"))

    # Text encoded with another codec is not equal to the same text,
    # because the bytes differ from its UTF-8 encoding.
    check_not_equal_encoding('Stéphane', 'latin1')
    check_not_equal_encoding('Stéphane', 'utf-16-le')  # embedded null characters
    check_not_equal_encoding('北京市', 'gbk')

    # CRASHES equaltoutf8andsize('abc', b'abc', -1)
    # CRASHES equaltoutf8andsize(b'abc', b'abc')
    # CRASHES equaltoutf8andsize([], b'abc')
    # CRASHES equaltoutf8andsize(NULL, b'abc')
    # CRASHES equaltoutf8andsize('abc', NULL)
diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 469fd27b622344..9d66b92eb8edf0 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2462,3 +2462,7 @@ added = '3.13' [function.Py_IsFinalizing] added = '3.13' +[function.PyUnicode_EqualToUTF8] + added = '3.13' +[function.PyUnicode_EqualToUTF8AndSize] + added = '3.13' diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 232b2ad543fca0..d52d88a65d86fc 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -1429,6 +1429,48 @@ unicode_comparewithasciistring(PyObject *self, PyObject *args) return PyLong_FromLong(result); } +/* Test PyUnicode_EqualToUTF8() */ +static PyObject * +unicode_equaltoutf8(PyObject *self, PyObject *args) +{ + PyObject *left; + const char *right = NULL; + Py_ssize_t right_len; + int result; + + if (!PyArg_ParseTuple(args, "Oz#", &left, &right, &right_len)) { + return NULL; + } + + NULLABLE(left); + result = PyUnicode_EqualToUTF8(left, right); + assert(!PyErr_Occurred()); + return PyLong_FromLong(result); +} + +/* Test PyUnicode_EqualToUTF8AndSize() */ +static PyObject * +unicode_equaltoutf8andsize(PyObject *self, PyObject *args) +{ + PyObject *left; + const char *right = NULL; + Py_ssize_t right_len; + Py_ssize_t size = -100; + int result; + + if (!PyArg_ParseTuple(args, "Oz#|n", &left, &right, &right_len, &size)) { + return NULL; + } + + NULLABLE(left); + if (size == -100) { + size = right_len; + } + result = PyUnicode_EqualToUTF8AndSize(left, right, size); + assert(!PyErr_Occurred()); + return PyLong_FromLong(result); +} + /* Test PyUnicode_RichCompare() */ static PyObject * unicode_richcompare(PyObject *self, PyObject *args) @@ -2044,6 +2086,8 @@ static PyMethodDef TestMethods[] = { {"unicode_replace", unicode_replace, METH_VARARGS}, {"unicode_compare", unicode_compare, METH_VARARGS}, {"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS}, + {"unicode_equaltoutf8", unicode_equaltoutf8, METH_VARARGS}, 
+ {"unicode_equaltoutf8andsize",unicode_equaltoutf8andsize, METH_VARARGS}, {"unicode_richcompare", unicode_richcompare, METH_VARARGS}, {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 49981a1f881c21..33cbc987d43282 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10673,6 +10673,82 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) } } +int +PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) +{ + return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str)); +} + +int +PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size) +{ + assert(_PyUnicode_CHECK(unicode)); + assert(str); + + if (PyUnicode_IS_ASCII(unicode)) { + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + return size == len && + memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0; + } + if (PyUnicode_UTF8(unicode) != NULL) { + Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode); + return size == len && + memcmp(PyUnicode_UTF8(unicode), str, len) == 0; + } + + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) { + return 0; + } + const unsigned char *s = (const unsigned char *)str; + const unsigned char *ends = s + (size_t)size; + int kind = PyUnicode_KIND(unicode); + const void *data = PyUnicode_DATA(unicode); + /* Compare Unicode string and UTF-8 string */ + for (Py_ssize_t i = 0; i < len; i++) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (ch < 0x80) { + if (ends == s || s[0] != ch) { + return 0; + } + s += 1; + } + else if (ch < 0x800) { + if ((ends - s) < 2 || + s[0] != (0xc0 | (ch >> 6)) || + s[1] != (0x80 | (ch & 0x3f))) + { + return 0; + } + s += 2; + } + else if (ch < 0x10000) { + if (Py_UNICODE_IS_SURROGATE(ch) || + (ends - s) < 3 || + s[0] != (0xe0 | (ch >> 12)) || + s[1] != (0x80 | ((ch >> 6) & 0x3f)) || + s[2] != (0x80 | (ch & 
0x3f))) + { + return 0; + } + s += 3; + } + else { + assert(ch <= MAX_UNICODE); + if ((ends - s) < 4 || + s[0] != (0xf0 | (ch >> 18)) || + s[1] != (0x80 | ((ch >> 12) & 0x3f)) || + s[2] != (0x80 | ((ch >> 6) & 0x3f)) || + s[3] != (0x80 | (ch & 0x3f))) + { + return 0; + } + s += 4; + } + } + return s == ends; +} + int _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str) { diff --git a/PC/python3dll.c b/PC/python3dll.c index 785d6886f39f6d..7ee11746770442 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -689,6 +689,8 @@ EXPORT_FUNC(PyUnicode_DecodeUTF8Stateful) EXPORT_FUNC(PyUnicode_EncodeCodePage) EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) +EXPORT_FUNC(PyUnicode_EqualToUTF8) +EXPORT_FUNC(PyUnicode_EqualToUTF8AndSize) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_Format)