diff --git a/CHANGES/1288.misc.rst b/CHANGES/1288.misc.rst new file mode 100644 index 000000000..3793e37e0 --- /dev/null +++ b/CHANGES/1288.misc.rst @@ -0,0 +1 @@ +Improved performance of the quoter when all characters are safe -- by :user:`bdraco`. diff --git a/yarl/_quoting_c.pyx b/yarl/_quoting_c.pyx index 6ac44fdf3..9e9f44ec4 100644 --- a/yarl/_quoting_c.pyx +++ b/yarl/_quoting_c.pyx @@ -2,7 +2,14 @@ from cpython.exc cimport PyErr_NoMemory from cpython.mem cimport PyMem_Free, PyMem_Malloc, PyMem_Realloc -from cpython.unicode cimport PyUnicode_DecodeASCII, PyUnicode_DecodeUTF8Stateful +from cpython.unicode cimport ( + PyUnicode_DATA, + PyUnicode_DecodeASCII, + PyUnicode_DecodeUTF8Stateful, + PyUnicode_GET_LENGTH, + PyUnicode_KIND, + PyUnicode_READ, +) from libc.stdint cimport uint8_t, uint64_t from libc.string cimport memcpy, memset @@ -20,14 +27,14 @@ cdef str QS = '+&=;' DEF BUF_SIZE = 8 * 1024 # 8KiB cdef char BUFFER[BUF_SIZE] -cdef inline Py_UCS4 _to_hex(uint8_t v): +cdef inline Py_UCS4 _to_hex(uint8_t v) noexcept: if v < 10: return (v+0x30) # ord('0') == 0x30 else: return (v+0x41-10) # ord('A') == 0x41 -cdef inline int _from_hex(Py_UCS4 v): +cdef inline int _from_hex(Py_UCS4 v) noexcept: if '0' <= v <= '9': return (v) - 0x30 # ord('0') == 0x30 elif 'A' <= v <= 'F': @@ -38,11 +45,11 @@ cdef inline int _from_hex(Py_UCS4 v): return -1 -cdef inline int _is_lower_hex(Py_UCS4 v): +cdef inline int _is_lower_hex(Py_UCS4 v) noexcept: return 'a' <= v <= 'f' -cdef inline Py_UCS4 _restore_ch(Py_UCS4 d1, Py_UCS4 d2): +cdef inline Py_UCS4 _restore_ch(Py_UCS4 d1, Py_UCS4 d2) noexcept: cdef int digit1 = _from_hex(d1) if digit1 < 0: return -1 @@ -56,11 +63,11 @@ cdef uint8_t ALLOWED_TABLE[16] cdef uint8_t ALLOWED_NOTQS_TABLE[16] -cdef inline bint bit_at(uint8_t array[], uint64_t ch): +cdef inline bint bit_at(uint8_t array[], uint64_t ch) noexcept: return array[ch >> 3] & (1 << (ch & 7)) -cdef inline void set_bit(uint8_t array[], uint64_t ch): +cdef inline void set_bit(uint8_t array[], uint64_t ch) noexcept: array[ch >> 3] |= (1 << (ch & 7)) @@ -202,7 +209,6 @@ cdef class _Quoter: set_bit(self._protected_table, ch) def __call__(self, val): - cdef Writer writer if val is None: return None if type(val) is not str: @@ -211,23 +217,55 @@ cdef class _Quoter: val = str(val) else: raise TypeError("Argument should be str") + return self._do_quote_or_skip(val) + + cdef str _do_quote_or_skip(self, str val): + cdef Py_UCS4 ch + cdef Py_ssize_t length = PyUnicode_GET_LENGTH(val) + cdef Py_ssize_t idx = length + cdef bint must_quote = 0 + cdef Writer writer + cdef int kind = PyUnicode_KIND(val) + cdef const void *data = PyUnicode_DATA(val) + + # If everything in the string is in the safe + # table and all ASCII, we can skip quoting + while idx: + idx -= 1 + ch = PyUnicode_READ(kind, data, idx) + if ch >= 128 or not bit_at(self._safe_table, ch): + must_quote = 1 + break + + if not must_quote: + return val + _init_writer(&writer) try: - return self._do_quote(val, &writer) + return self._do_quote(val, length, kind, data, &writer) finally: _release_writer(&writer) - cdef str _do_quote(self, str val, Writer *writer): + cdef str _do_quote( + self, + str val, + Py_ssize_t length, + int kind, + const void *data, + Writer *writer + ): cdef Py_UCS4 ch cdef int changed - cdef int idx = 0 - cdef int length = len(val) + cdef Py_ssize_t idx = 0 while idx < length: - ch = val[idx] + ch = PyUnicode_READ(kind, data, idx) idx += 1 if ch == '%' and self._requote and idx <= length - 2: - ch = _restore_ch(val[idx], val[idx + 1]) + ch = _restore_ch( + PyUnicode_READ(kind, data, idx), + PyUnicode_READ(kind, data, idx + 1) + ) if ch != -1: idx += 2 if ch < 128: @@ -241,8 +279,8 @@ cdef class _Quoter: raise continue - changed = (_is_lower_hex(val[idx - 2]) or - _is_lower_hex(val[idx - 1])) + changed = (_is_lower_hex(PyUnicode_READ(kind, data, idx - 2)) or + _is_lower_hex(PyUnicode_READ(kind, data, idx - 1))) if _write_pct(writer, ch, changed) < 0: raise continue