add some comments

methane · Nov 19, 2024 · 72ed21d · 72ed21d
1 parent 092c189
commit 72ed21d
Showing 1 changed file with 37 additions and 3 deletions.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -785,6 +785,22 @@ unicode_result(PyObject *unicode)
 static PyObject*
 unicode_result_unchanged(PyObject *unicode)
 {
+
+/* Check if a Unicode string is a palindrome.
+ *
+ * Walks the first half of the string and compares the element at index i
+ * with the one at index (length - i - 1), both read with PyUnicode_READ.
+ * Returns 0 at the first mismatch and 1 otherwise; a string of length 0
+ * or 1 never enters the loop and therefore returns 1.
+ *
+ * NOTE(review): in this hunk the definition is added right after the
+ * opening brace of unicode_result_unchanged(), i.e. inside that function's
+ * body, and no caller is visible here -- confirm the intended placement.
+ */
+static int
+unicode_is_palindrome(PyObject *unicode)
+{
+    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
+    int kind = PyUnicode_KIND(unicode);
+    const void *data = PyUnicode_DATA(unicode);
+
+    for (Py_ssize_t i = 0; i < length / 2; i++) {
+        if (PyUnicode_READ(kind, data, i) != PyUnicode_READ(kind, data, length - i - 1)) {
+            return 0;
+        }
+    }
+    return 1;
+}
     if (PyUnicode_CheckExact(unicode)) {
         return Py_NewRef(unicode);
     }
@@ -5061,6 +5077,14 @@ load_unaligned(const unsigned char *p, size_t size)
 }
 #endif
 
+/*
+ * Find the first non-ASCII character in a byte sequence.
+ *
+ * This function scans a range of bytes from `start` to `end` and returns the
+ * index of the first byte that is not an ASCII character (i.e., has the most
+ * significant bit set). If all characters in the range are ASCII, it returns
+ * `end - start`.
+ */
 static Py_ssize_t
 find_first_nonascii(const unsigned char *start, const unsigned char *end)
 {
@@ -5122,18 +5146,23 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
 #endif
 }
 
-static inline int scalar_utf8_start_char(unsigned int ch)
+static inline int
+scalar_utf8_start_char(unsigned int ch)
 {
     // A byte of the form 0xxxxxxx or 11xxxxxx is the first byte of a character.
     // Bit 0 of (~ch >> 7) is set when bit 7 of ch is clear, and bit 0 of
     // (ch >> 6) is set when bit 6 is set, so the result is 0 only for 10xxxxxx.
     return (~ch >> 7 | ch >> 6) & 1;
 }
 
-static inline size_t vector_utf8_start_chars(size_t v)
+static inline size_t
+vector_utf8_start_chars(size_t v)
 {
     // Same bit test as scalar_utf8_start_char(), but masked with VECTOR_0101
     // instead of 1. NOTE(review): VECTOR_0101 is defined outside this hunk;
     // presumably it has 0x01 in every byte of a size_t so that each byte of
     // v is tested at once -- confirm.
     return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
 }
 
-static Py_ssize_t utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
+
+// Count the number of UTF-8 code points in a given byte sequence.
+static Py_ssize_t
+utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
 {
     Py_ssize_t len = 0;
 
@@ -5377,6 +5406,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
     // otherwise: check the input and decide the maxchr and maxsize to reduce
     // reallocation and copy.
     if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
+        // We only calculate the number of codepoints here and do not determine the exact maxchr,
+        // because writing fast and portable SIMD code to find maxchr is difficult.
+        // If reallocation occurs for a larger maxchr, knowing the exact number of codepoints
+        // means that it is no longer necessary to allocate several times the required amount
+        // of memory.
         maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
         if (ch < 0xc4) { // latin1
             maxchr = 0xff;