From ac1486e29d56eaa899853415fa22d7f746d4122d Mon Sep 17 00:00:00 2001 From: Ryan Liptak Date: Thu, 10 Oct 2024 20:47:19 -0700 Subject: [PATCH] Some cleanup/improvements to wtf8/utf16 functions - Remove the `len` parameter from `utf16_` functions, since string.sub can be used to pass a different length to those functions - Remove the allocation of an intermediate NUL-terminated UTF-16 string in `luv_utf16_length_as_wtf8`/`luv_utf16_to_wtf8`, since a NUL terminator is not needed when specifying the length (it's only needed when using `-1` as the length) - Add some more test cases --- docs.md | 21 +++++++++------------ src/misc.c | 33 ++++++++++----------------------- tests/test-misc.lua | 23 ++++++++++++++++++++++- 3 files changed, 41 insertions(+), 36 deletions(-) diff --git a/docs.md b/docs.md index 76c555e9..ef74a499 100644 --- a/docs.md +++ b/docs.md @@ -4058,40 +4058,37 @@ These string utilities are needed internally for dealing with Windows, and are e **Notes**: 1. New in luv version 1.49.0. -2. A UTF-16 character is 2 bytes, and a UTF-8 character is 1 byte. -3. Luv use Lua style string, which means that all inputs and return values (UTF-8 or UTF-16 strings) not include NUL terminated. +2. See [the WTF-8 spec](https://simonsapin.github.io/wtf-8/) for information about WTF-8. +3. Luv uses Lua-style strings, which means that all inputs and return values (UTF-8 or UTF-16 strings) do not include a NUL terminator. -### `uv.utf16_length_as_wtf8()` +### `uv.utf16_length_as_wtf8(utf16)` -Get the length of a UTF-16 (or UCS-2) string `utf16` value after converting it to WTF-8. +Get the length (in bytes) of a UTF-16 (or UCS-2) string `utf16` value after converting it to WTF-8. **Parameters:** - `utf16`: `string` -- `len`: `integer` or `nil` (default: `#utf16/2`) **Returns:** `integer` -### `uv.utf16_to_wtf8()` +### `uv.utf16_to_wtf8(utf16)` -Convert UTF-16 (or UCS-2) string `utf16` to UTF-8 string. The `len` count (in characters) -gives the length of utf16. 
+Convert UTF-16 (or UCS-2) string `utf16` to WTF-8 string. **Parameters:** - `utf16`: `string` -- `len`: `integer` or `nil` (default: `#utf16/2`) **Returns:** `string` -### `uv.wtf8_length_as_utf16()` +### `uv.wtf8_length_as_utf16(wtf8)` -Get the length in characters of a WTF-8 `wtf8` value after converting it to UTF-16 (or UCS-2). +Get the length (in UTF-16 code units) of a WTF-8 `wtf8` value after converting it to UTF-16 (or UCS-2). Note: The number of bytes needed for a UTF-16 (or UCS-2) string is `<number of code units> * 2`. **Parameters:** - `wtf8`: `string` **Returns:** `integer` -### `uv.wtf8_to_utf16()` +### `uv.wtf8_to_utf16(wtf8)` Convert WTF-8 string in `wtf8` to UTF-16 (or UCS-2) string. diff --git a/src/misc.c b/src/misc.c index 0d9a804e..cce43f66 100644 --- a/src/misc.c +++ b/src/misc.c @@ -788,16 +788,10 @@ static int luv_clock_gettime(lua_State* L) { static int luv_utf16_length_as_wtf8(lua_State* L) { size_t sz; const uint16_t *utf16 = (const uint16_t *)luaL_checklstring(L, 1, &sz); - ssize_t utf16_len = luaL_optinteger(L, 2, sz/2); - /* pad NUL terminator */ - uint16_t *ws = malloc(sz+2); - if (ws== NULL) return luaL_error(L, "failed to allocate %zu bytes", sz + 2); - memcpy(ws, utf16, sz); - ws[sz/2] = 0; - sz = uv_utf16_length_as_wtf8(ws, utf16_len+1); - /* The returned length not include NUL terminator, we use Lua style string */ - lua_pushinteger(L, sz - 1); - free(ws); + ssize_t utf16_len = sz/2; + sz = uv_utf16_length_as_wtf8(utf16, utf16_len); + /* The returned length includes a NUL terminator, but we use Lua style string */ + lua_pushinteger(L, sz); return 1; } @@ -806,25 +800,18 @@ static int luv_utf16_to_wtf8(lua_State *L) { size_t sz; char *wtf8; const uint16_t *utf16 = (const uint16_t *)luaL_checklstring(L, 1, &sz); - ssize_t utf16_len = luaL_optinteger(L, 2, sz/2); - /* pad NUL terminator */ - uint16_t *ws = malloc(2*(utf16_len+1)); - if (ws== NULL) return luaL_error(L, "failed to allocate %zu bytes", 2*(utf16_len+1)); - memcpy(ws, utf16, 2*utf16_len); - 
ws[utf16_len] = 0; - sz = uv_utf16_length_as_wtf8(ws, utf16_len+1); + ssize_t utf16_len = sz/2; + sz = uv_utf16_length_as_wtf8(utf16, utf16_len); wtf8 = malloc(sz + 1); if (wtf8 == NULL) return luaL_error(L, "failed to allocate %zu bytes", sz + 1); - ret = uv_utf16_to_wtf8(ws, utf16_len+1, &wtf8, &sz); + ret = uv_utf16_to_wtf8(utf16, utf16_len, &wtf8, &sz); if (ret == 0) { - /* The returned string include NUL terminator, we use Lua style string */ - lua_pushlstring(L, wtf8, sz - 1); + lua_pushlstring(L, wtf8, sz); ret = 1; } else { ret = luv_error(L, ret); } free(wtf8); - free(ws); return ret; } @@ -838,7 +825,7 @@ static int luv_wtf8_length_as_utf16(lua_State *L) { s[sz] = '\0'; ssz = uv_wtf8_length_as_utf16(s); free(s); - /* The returned length not include NUL terminator, we use Lua style string */ + /* The returned length should not include NUL terminator, we use Lua style string */ lua_pushinteger(L, ssz - 1); return 1; } @@ -856,7 +843,7 @@ static int luv_wtf8_to_utf16(lua_State *L) { utf16 = malloc(ssz * 2); if (utf16 == NULL) return luaL_error(L, "failed to allocate %zu bytes", ssz * 2); uv_wtf8_to_utf16(s, utf16, ssz); - /* The returned string include NUL terminator, we use Lua style string */ + /* The returned string includes a NUL terminator, but we use Lua style string */ lua_pushlstring(L, (const char*)utf16, (ssz-1) * 2); free(utf16); free(s); diff --git a/tests/test-misc.lua b/tests/test-misc.lua index 46b5bdcf..890e51ef 100644 --- a/tests/test-misc.lua +++ b/tests/test-misc.lua @@ -219,7 +219,7 @@ return require('lib/tap')(function (test) end end, "1.45.0") - test("uv.wtf8 and utf8 conversion", function(print, p, expect, uv) + test("uv.wtf8 and utf16 conversion", function(print, p, expect, uv) -- default encoding is utf8/wtf8 local utf8 = string.char(0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87) -- The utf8 content is "中文" @@ -231,4 +231,25 @@ return require('lib/tap')(function (test) assert(utf8=='中文', utf8) end, "1.49.0") + test("uv.wtf8<->utf16 unpaired 
surrogate", function(print, p, expect, uv) + -- WTF-8 encoding of the surrogate codepoint U+D83D (surrogate codepoints + -- don't have a valid UTF-8 encoding, but can be encoded as WTF-8) + local wtf8 = string.char(0xed, 0xa0, 0xbd) + local utf16 = uv.wtf8_to_utf16(wtf8) + assert(#utf16==2, #utf16) + -- U+D83D as little-endian WTF-16 + assert(utf16==string.char(0x3d, 0xd8)) + assert(uv.utf16_length_as_wtf8(utf16) == #wtf8, uv.utf16_length_as_wtf8(utf16)) + assert(uv.wtf8_length_as_utf16(wtf8) == 1, uv.wtf8_length_as_utf16(wtf8)) + local roundtrip_wtf8 = uv.utf16_to_wtf8(utf16) + assert(roundtrip_wtf8==wtf8, roundtrip_wtf8) + end, "1.49.0") + + test("uv.wtf8<->utf16 empty strings", function(print, p, expect, uv) + assert(uv.wtf8_to_utf16("") == "") + assert(uv.utf16_to_wtf8("") == "") + assert(uv.wtf8_length_as_utf16("") == 0) + assert(uv.utf16_length_as_wtf8("") == 0) + end, "1.49.0") + end)