Some cleanup/improvements to wtf8/utf16 functions

- Remove the `len` parameter from `utf16_` functions, since `string.sub` can be used to pass a different length to those functions
- Remove the allocation of an intermediate NUL-terminated UTF-16 string in `luv_utf16_length_as_wtf8`/`luv_utf16_to_wtf8`, since a NUL terminator is not needed when specifying the length (it's only needed when using `-1` as the length)
- Add some more test cases
luvit · Oct 11, 2024 · ac1486e · ac1486e
1 parent 72f39b2
commit ac1486e
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 36 deletions.
diff --git a/docs.md b/docs.md
@@ -4058,40 +4058,37 @@ These string utilities are needed internally for dealing with Windows, and are e
 **Notes**:
 
 1. New in luv version 1.49.0.
-2. A UTF-16 character is 2 bytes, and a UTF-8 character is 1 byte.
-3. Luv use Lua style string, which means that all inputs and return values (UTF-8 or UTF-16 strings) not include NUL terminated.
+2. See [the WTF-8 spec](https://simonsapin.github.io/wtf-8/) for information about WTF-8.
+3. Luv uses Lua-style strings, which means that all inputs and return values (UTF-8 or UTF-16 strings) do not include a NUL terminator.
 
-### `uv.utf16_length_as_wtf8()`
+### `uv.utf16_length_as_wtf8(utf16)`
 
-Get the length of a UTF-16 (or UCS-2) string `utf16` value after converting it to WTF-8.
+Get the length (in bytes) of a UTF-16 (or UCS-2) string `utf16` value after converting it to WTF-8.
 
 **Parameters:**
 - `utf16`: `string`
-- `len`: `integer` or `nil` (default: `#utf16/2`)
 
 **Returns:** `integer`
 
-### `uv.utf16_to_wtf8()`
+### `uv.utf16_to_wtf8(utf16)`
 
-Convert UTF-16 (or UCS-2) string `utf16` to UTF-8 string. The `len` count (in characters)
-gives the length of utf16.
+Convert UTF-16 (or UCS-2) string `utf16` to WTF-8 string.
 
 **Parameters:**
 - `utf16`: `string`
-- `len`: `integer` or `nil` (default: `#utf16/2`)
 
 **Returns:** `string`
 
-### `uv.wtf8_length_as_utf16()`
+### `uv.wtf8_length_as_utf16(wtf8)`
 
-Get the length in characters of a WTF-8 `wtf8` value after converting it to UTF-16 (or UCS-2).
+Get the length (in UTF-16 code units) of a WTF-8 `wtf8` value after converting it to UTF-16 (or UCS-2). Note: The number of bytes needed for a UTF-16 (or UCS-2) string is `<number of code units> * 2`.
 
 **Parameters:**
 - `wtf8`: `string`
 
 **Returns:** `integer`
 
-### `uv.wtf8_to_utf16()`
+### `uv.wtf8_to_utf16(wtf8)`
 
 Convert WTF-8 string in `wtf8` to UTF-16 (or UCS-2) string.
 

diff --git a/src/misc.c b/src/misc.c
@@ -788,16 +788,10 @@ static int luv_clock_gettime(lua_State* L) {
 static int luv_utf16_length_as_wtf8(lua_State* L) {
   size_t sz;
   const uint16_t *utf16 = (const uint16_t *)luaL_checklstring(L, 1, &sz);
-  ssize_t utf16_len = luaL_optinteger(L, 2, sz/2);
-  /* pad NUL terminator */
-  uint16_t *ws = malloc(sz+2);
-  if (ws== NULL) return luaL_error(L, "failed to allocate %zu bytes", sz + 2);
-  memcpy(ws, utf16, sz);
-  ws[sz/2] = 0;
-  sz = uv_utf16_length_as_wtf8(ws, utf16_len+1);
-  /* The returned length not include NUL terminator, we use Lua style string */
-  lua_pushinteger(L, sz - 1);
-  free(ws);
+  ssize_t utf16_len = sz/2;
+  sz = uv_utf16_length_as_wtf8(utf16, utf16_len);
+  /* No NUL terminator was passed in, so the returned length does not count one; this matches Lua style strings */
+  lua_pushinteger(L, sz);
   return 1;
 }
 
@@ -806,25 +800,18 @@ static int luv_utf16_to_wtf8(lua_State *L) {
   size_t sz;
   char *wtf8;
   const uint16_t *utf16 = (const uint16_t *)luaL_checklstring(L, 1, &sz);
-  ssize_t utf16_len = luaL_optinteger(L, 2, sz/2);
-  /* pad NUL terminator */
-  uint16_t *ws = malloc(2*(utf16_len+1));
-  if (ws== NULL) return luaL_error(L, "failed to allocate %zu bytes", 2*(utf16_len+1));
-  memcpy(ws, utf16, 2*utf16_len);
-  ws[utf16_len] = 0;
-  sz = uv_utf16_length_as_wtf8(ws, utf16_len+1);
+  ssize_t utf16_len = sz/2;
+  sz = uv_utf16_length_as_wtf8(utf16, utf16_len);
   wtf8 = malloc(sz + 1);
   if (wtf8 == NULL) return luaL_error(L, "failed to allocate %zu bytes", sz + 1);
-  ret = uv_utf16_to_wtf8(ws, utf16_len+1, &wtf8, &sz);
+  ret = uv_utf16_to_wtf8(utf16, utf16_len, &wtf8, &sz);
   if (ret == 0) {
-    /* The returned string include NUL terminator, we use Lua style string */
-    lua_pushlstring(L, wtf8, sz - 1);
+    lua_pushlstring(L, wtf8, sz);
     ret = 1;
   } else {
     ret = luv_error(L, ret);
   }
   free(wtf8);
-  free(ws);
   return ret;
 }
 
@@ -838,7 +825,7 @@ static int luv_wtf8_length_as_utf16(lua_State *L) {
   s[sz] = '\0';
   ssz = uv_wtf8_length_as_utf16(s);
   free(s);
-  /* The returned length not include NUL terminator, we use Lua style string */
+  /* The returned length should not include NUL terminator, we use Lua style string */
   lua_pushinteger(L, ssz - 1);
   return 1;
 }
@@ -856,7 +843,7 @@ static int luv_wtf8_to_utf16(lua_State *L) {
   utf16 = malloc(ssz * 2);
   if (utf16 == NULL) return luaL_error(L, "failed to allocate %zu bytes", ssz * 2);
   uv_wtf8_to_utf16(s, utf16, ssz);
-  /* The returned string include NUL terminator, we use Lua style string */
+  /* The returned string includes a NUL terminator, but we use Lua style string */
   lua_pushlstring(L, (const char*)utf16, (ssz-1) * 2);
   free(utf16);
   free(s);

diff --git a/tests/test-misc.lua b/tests/test-misc.lua
@@ -219,7 +219,7 @@ return require('lib/tap')(function (test)
     end
   end, "1.45.0")
 
-  test("uv.wtf8 and utf8 conversion", function(print, p, expect, uv)
+  test("uv.wtf8 and utf16 conversion", function(print, p, expect, uv)
     -- default encoding is utf8/wtf8
     local utf8 = string.char(0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87)
     -- The utf8 content is "中文"
@@ -231,4 +231,25 @@ return require('lib/tap')(function (test)
     assert(utf8=='中文', utf8)
   end, "1.49.0")
 
+  test("uv.wtf8<->utf16 unpaired surrogate", function(print, p, expect, uv)
+    -- WTF-8 encoding of the surrogate codepoint U+D83D (surrogate codepoints
+    -- don't have a valid UTF-8 encoding, but can be encoded as WTF-8)
+    local wtf8 = string.char(0xed, 0xa0, 0xbd)
+    local utf16 = uv.wtf8_to_utf16(wtf8)
+    assert(#utf16==2, #utf16)
+    -- U+D83D as little-endian WTF-16
+    assert(utf16==string.char(0x3d, 0xd8))
+    assert(uv.utf16_length_as_wtf8(utf16) == #wtf8, uv.utf16_length_as_wtf8(utf16))
+    assert(uv.wtf8_length_as_utf16(wtf8) == 1, uv.wtf8_length_as_utf16(wtf8))
+    local roundtrip_wtf8 = uv.utf16_to_wtf8(utf16)
+    assert(roundtrip_wtf8==wtf8, roundtrip_wtf8)
+  end, "1.49.0")
+
+  test("uv.wtf8<->utf16 empty strings", function(print, p, expect, uv)
+    assert(uv.wtf8_to_utf16("") == "")
+    assert(uv.utf16_to_wtf8("") == "")
+    assert(uv.wtf8_length_as_utf16("") == 0)
+    assert(uv.utf16_length_as_wtf8("") == 0)
+  end, "1.49.0")
+
 end)