Skip to content

Commit

Permalink
Some cleanup/improvements to wtf8/utf16 functions
Browse files Browse the repository at this point in the history
- Remove the `len` parameter from `utf16_` functions, since string.sub can be used to pass a different length to those functions
- Remove the allocation of an intermediate NUL-terminated UTF-16 string in `luv_utf16_length_as_wtf8`/`luv_utf16_to_wtf8`, since a NUL terminator is not needed when specifying the length (it's only needed when using `-1` as the length)
- Add some more test cases
  • Loading branch information
squeek502 authored and zhaozg committed Oct 11, 2024
1 parent 72f39b2 commit ac1486e
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 36 deletions.
21 changes: 9 additions & 12 deletions docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -4058,40 +4058,37 @@ These string utilities are needed internally for dealing with Windows, and are e
**Notes**:

1. New in luv version 1.49.0.
2. A UTF-16 character is 2 bytes, and a UTF-8 character is 1 byte.
3. Luv use Lua style string, which means that all inputs and return values (UTF-8 or UTF-16 strings) not include NUL terminated.
2. See [the WTF-8 spec](https://simonsapin.github.io/wtf-8/) for information about WTF-8.
3. Luv uses Lua-style strings, which means that all inputs and return values (UTF-8 or UTF-16 strings) do not include a NUL terminator.

### `uv.utf16_length_as_wtf8()`
### `uv.utf16_length_as_wtf8(utf16)`

Get the length of a UTF-16 (or UCS-2) string `utf16` value after converting it to WTF-8.
Get the length (in bytes) of a UTF-16 (or UCS-2) string `utf16` value after converting it to WTF-8.

**Parameters:**
- `utf16`: `string`
- `len`: `integer` or `nil` (default: `#utf16/2`)

**Returns:** `integer`

### `uv.utf16_to_wtf8()`
### `uv.utf16_to_wtf8(utf16)`

Convert UTF-16 (or UCS-2) string `utf16` to UTF-8 string. The `len` count (in characters)
gives the length of utf16.
Convert the UTF-16 (or UCS-2) string `utf16` to a WTF-8 string.

**Parameters:**
- `utf16`: `string`
- `len`: `integer` or `nil` (default: `#utf16/2`)

**Returns:** `string`

### `uv.wtf8_length_as_utf16()`
### `uv.wtf8_length_as_utf16(wtf8)`

Get the length in characters of a WTF-8 `wtf8` value after converting it to UTF-16 (or UCS-2).
Get the length (in UTF-16 code units) of a WTF-8 `wtf8` value after converting it to UTF-16 (or UCS-2). Note: The number of bytes needed for a UTF-16 (or UCS-2) string is `<number of code units> * 2`.

**Parameters:**
- `wtf8`: `string`

**Returns:** `integer`

### `uv.wtf8_to_utf16()`
### `uv.wtf8_to_utf16(wtf8)`

Convert the WTF-8 string `wtf8` to a UTF-16 (or UCS-2) string.

Expand Down
33 changes: 10 additions & 23 deletions src/misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -788,16 +788,10 @@ static int luv_clock_gettime(lua_State* L) {
static int luv_utf16_length_as_wtf8(lua_State* L) {
size_t sz;
const uint16_t *utf16 = (const uint16_t *)luaL_checklstring(L, 1, &sz);
ssize_t utf16_len = luaL_optinteger(L, 2, sz/2);
/* pad NUL terminator */
uint16_t *ws = malloc(sz+2);
if (ws== NULL) return luaL_error(L, "failed to allocate %zu bytes", sz + 2);
memcpy(ws, utf16, sz);
ws[sz/2] = 0;
sz = uv_utf16_length_as_wtf8(ws, utf16_len+1);
/* The returned length not include NUL terminator, we use Lua style string */
lua_pushinteger(L, sz - 1);
free(ws);
ssize_t utf16_len = sz/2;
sz = uv_utf16_length_as_wtf8(utf16, utf16_len);
/* utf16_len is explicit and covers no NUL terminator, so the returned length is pushed as-is for a Lua style string */
lua_pushinteger(L, sz);
return 1;
}

Expand All @@ -806,25 +800,18 @@ static int luv_utf16_to_wtf8(lua_State *L) {
size_t sz;
char *wtf8;
const uint16_t *utf16 = (const uint16_t *)luaL_checklstring(L, 1, &sz);
ssize_t utf16_len = luaL_optinteger(L, 2, sz/2);
/* pad NUL terminator */
uint16_t *ws = malloc(2*(utf16_len+1));
if (ws== NULL) return luaL_error(L, "failed to allocate %zu bytes", 2*(utf16_len+1));
memcpy(ws, utf16, 2*utf16_len);
ws[utf16_len] = 0;
sz = uv_utf16_length_as_wtf8(ws, utf16_len+1);
ssize_t utf16_len = sz/2;
sz = uv_utf16_length_as_wtf8(utf16, utf16_len);
wtf8 = malloc(sz + 1);
if (wtf8 == NULL) return luaL_error(L, "failed to allocate %zu bytes", sz + 1);
ret = uv_utf16_to_wtf8(ws, utf16_len+1, &wtf8, &sz);
ret = uv_utf16_to_wtf8(utf16, utf16_len, &wtf8, &sz);
if (ret == 0) {
/* The returned string includes a NUL terminator, but we use Lua style string */
lua_pushlstring(L, wtf8, sz - 1);
lua_pushlstring(L, wtf8, sz);
ret = 1;
} else {
ret = luv_error(L, ret);
}
free(wtf8);
free(ws);
return ret;
}

Expand All @@ -838,7 +825,7 @@ static int luv_wtf8_length_as_utf16(lua_State *L) {
s[sz] = '\0';
ssz = uv_wtf8_length_as_utf16(s);
free(s);
/* The returned length not include NUL terminator, we use Lua style string */
/* The length handed back to Lua should not include the NUL terminator, since we use Lua style strings */
lua_pushinteger(L, ssz - 1);
return 1;
}
Expand All @@ -856,7 +843,7 @@ static int luv_wtf8_to_utf16(lua_State *L) {
utf16 = malloc(ssz * 2);
if (utf16 == NULL) return luaL_error(L, "failed to allocate %zu bytes", ssz * 2);
uv_wtf8_to_utf16(s, utf16, ssz);
/* The returned string include NUL terminator, we use Lua style string */
/* The returned string includes a NUL terminator, but we use Lua style string */
lua_pushlstring(L, (const char*)utf16, (ssz-1) * 2);
free(utf16);
free(s);
Expand Down
23 changes: 22 additions & 1 deletion tests/test-misc.lua
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ return require('lib/tap')(function (test)
end
end, "1.45.0")

test("uv.wtf8 and utf8 conversion", function(print, p, expect, uv)
test("uv.wtf8 and utf16 conversion", function(print, p, expect, uv)
-- default encoding is utf8/wtf8
local utf8 = string.char(0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87)
-- The utf8 content is "中文"
Expand All @@ -231,4 +231,25 @@ return require('lib/tap')(function (test)
assert(utf8=='中文', utf8)
end, "1.49.0")

test("uv.wtf8<->utf16 unpaired surrogate", function(print, p, expect, uv)
  -- U+D83D is a surrogate codepoint: it has no valid UTF-8 encoding, but
  -- WTF-8 can represent it with the three bytes below.
  local lone_surrogate = string.char(0xed, 0xa0, 0xbd)

  -- WTF-8 -> UTF-16: a single surrogate occupies one 2-byte code unit.
  local widened = uv.wtf8_to_utf16(lone_surrogate)
  assert(#widened == 2, #widened)

  -- The code unit 0xD83D stored little-endian (low byte first).
  local expected_le = string.char(0x3d, 0xd8)
  assert(widened == expected_le)

  -- The length helpers must agree with the sizes of the actual conversions.
  assert(
    uv.utf16_length_as_wtf8(widened) == #lone_surrogate,
    uv.utf16_length_as_wtf8(widened)
  )
  assert(
    uv.wtf8_length_as_utf16(lone_surrogate) == 1,
    uv.wtf8_length_as_utf16(lone_surrogate)
  )

  -- UTF-16 -> WTF-8 must reproduce the original bytes exactly.
  local narrowed = uv.utf16_to_wtf8(widened)
  assert(narrowed == lone_surrogate, narrowed)
end, "1.49.0")

test("uv.wtf8<->utf16 empty strings", function(print, p, expect, uv)
  -- An empty input must convert to an empty output in both directions...
  local empty = ""
  assert(uv.wtf8_to_utf16(empty) == empty)
  assert(uv.utf16_to_wtf8(empty) == empty)

  -- ...and both length helpers must report zero for it.
  local no_units = 0
  assert(uv.wtf8_length_as_utf16(empty) == no_units)
  assert(uv.utf16_length_as_wtf8(empty) == no_units)
end, "1.49.0")

end)

0 comments on commit ac1486e

Please sign in to comment.