Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor String#to_utf16 optimizing for ascii-only #8526

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions spec/std/string/utf16_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,34 @@ require "spec"

describe "String UTF16" do
describe "to_utf16" do
  # Every example makes two assertions:
  #   1. the returned slice holds exactly the expected UTF-16 code units;
  #   2. the element one position past the end of the slice is zero. That
  #      index is outside the slice's bounds, hence `unsafe_fetch`.

  it "in the range U+0000..U+FF" do
    units = "\u{0}hello\u{ff}".to_utf16
    expected = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xff_u16]

    units.should eq(expected)
    units.unsafe_fetch(units.size).should eq(0_u16)
  end

  it "in the range U+0000..U+D7FF" do
    units = "\u{0}hello\u{d7ff}".to_utf16
    expected = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]

    units.should eq(expected)
    units.unsafe_fetch(units.size).should eq(0_u16)
  end

  it "in the range U+E000 to U+FFFF" do
    units = "\u{e000}\u{ffff}".to_utf16
    expected = Slice[0xe000_u16, 0xffff_u16]

    units.should eq(expected)
    units.unsafe_fetch(units.size).should eq(0_u16)
  end

  it "in the range U+10000..U+10FFFF" do
    # Code points above U+FFFF are expected as two code units each.
    units = "\u{10000}\u{10FFFF}".to_utf16
    expected = Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16]

    units.should eq(expected)
    units.unsafe_fetch(units.size).should eq(0_u16)
  end

  it "in the range U+D800..U+DFFF" do
    # Both inputs are expected to come out as 0xFFFD.
    units = "\u{D800}\u{DFFF}".to_utf16
    expected = Slice[0xFFFD_u16, 0xFFFD_u16]

    units.should eq(expected)
    units.unsafe_fetch(units.size).should eq(0_u16)
  end
end

Expand Down
32 changes: 25 additions & 7 deletions src/string/utf16.cr
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,40 @@ class String
# "hi 𐂥".to_utf16 # => Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
# ```
def to_utf16 : Slice(UInt16)
size = 0
if ascii_only?
# size == bytesize, so each char fits in one UInt16

# This is essentially equivalent to `to_slice.map(&.to_u16)` but also makes
# sure to allocate a trailing null code unit (`0_u16`) after the string.
slice = Slice(UInt16).new(bytesize + 1) do |i|
if i == bytesize
0_u16
else
unsafe_byte_at(i).to_u16
end
end
return slice[0, bytesize]
asterite marked this conversation as resolved.
Show resolved Hide resolved
end

# size < bytesize, so we need to count the number of characters that are
# two UInt16 wide.
u16_size = 0
each_char do |char|
size += char.ord < 0x10000 ? 1 : 2
u16_size += char.ord < 0x1_0000 ? 1 : 2
straight-shoota marked this conversation as resolved.
Show resolved Hide resolved
end

slice = Slice(UInt16).new(size + 1)
# Allocate one extra character for trailing null
slice = Slice(UInt16).new(u16_size + 1)

i = 0
each_char do |char|
ord = char.ord
if ord <= 0xd800 || (0xe000 <= ord < 0x10000)
if ord <= 0xd800 || (0xe000 <= ord < 0x1_0000)
# One UInt16 is enough
slice[i] = ord.to_u16
elsif ord >= 0x10000
elsif ord >= 0x1_0000
# Needs surrogate pair
ord -= 0x10000
ord -= 0x1_0000
slice[i] = 0xd800_u16 + ((ord >> 10) & 0x3ff) # Keep top 10 bits
i += 1
slice[i] = 0xdc00_u16 + (ord & 0x3ff) # Keep low 10 bits
Expand All @@ -41,7 +59,7 @@ class String
# Append trailing null code unit (`0_u16`)
slice[i] = 0_u16

slice[0, size]
slice[0, u16_size]
end

# Decodes the given *slice* UTF-16 sequence into a String.
Expand Down