Skip to content

Commit

Permalink
Refactor String#to_utf16 optimizing for ascii-only (#8526)
Browse files Browse the repository at this point in the history
* Refactor String#to_utf16 optimizing for ascii-only

* fixup! Refactor String#to_utf16 optimizing for ascii-only

* fixup! Refactor String#to_utf16 optimizing for ascii-only
  • Loading branch information
straight-shoota authored and asterite committed Dec 6, 2019
1 parent 019a320 commit 7fd3863
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 7 deletions.
10 changes: 10 additions & 0 deletions spec/std/string/utf16_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,34 @@ require "spec"

describe "String UTF16" do
# Specs for `String#to_utf16`, one example per code point range.
#
# Every example checks two things:
# 1. the returned slice holds the expected UTF-16 code units, and
# 2. the element located one position past the end of the returned slice is
#    `0_u16`. That position is read with `unsafe_fetch(encoded.size)` because
#    the index equals the slice size, i.e. it lies outside the slice itself.
#    This verifies the trailing null that `to_utf16` writes after the data.
describe "to_utf16" do
# Code points up to U+00FF: each one maps to a single UInt16 of the same value.
it "in the range U+0000..U+FF" do
encoded = "\u{0}hello\u{ff}".to_utf16
encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xff_u16])
# Read one past the end of the slice: expects the trailing null.
encoded.unsafe_fetch(encoded.size).should eq 0_u16
end

# Code points up to U+D7FF (just below the surrogate range): still one UInt16 each.
it "in the range U+0000..U+D7FF" do
encoded = "\u{0}hello\u{d7ff}".to_utf16
encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16])
# Read one past the end of the slice: expects the trailing null.
encoded.unsafe_fetch(encoded.size).should eq 0_u16
end

# Code points from U+E000 to U+FFFF (above the surrogate range): one UInt16 each.
it "in the range U+E000 to U+FFFF" do
encoded = "\u{e000}\u{ffff}".to_utf16
encoded.should eq(Slice[0xe000_u16, 0xffff_u16])
# Read one past the end of the slice: expects the trailing null.
encoded.unsafe_fetch(encoded.size).should eq 0_u16
end

# Code points at or above U+10000 are expected as two UInt16 each
# (a surrogate pair): U+10000 => d800 dc00, U+10FFFF => dbff dfff.
it "in the range U+10000..U+10FFFF" do
encoded = "\u{10000}\u{10FFFF}".to_utf16
encoded.should eq(Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16])
# Read one past the end of the slice: expects the trailing null.
encoded.unsafe_fetch(encoded.size).should eq 0_u16
end

# Escapes in the surrogate range are expected to come out as 0xFFFD, one per
# escape.
# NOTE(review): 0xFFFD is presumably the Unicode replacement character; the
# code path that produces it is not visible in this spec - confirm against
# the implementation.
it "in the range U+D800..U+DFFF" do
encoded = "\u{D800}\u{DFFF}".to_utf16
encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16])
# Read one past the end of the slice: expects the trailing null.
encoded.unsafe_fetch(encoded.size).should eq 0_u16
end
end

Expand Down
32 changes: 25 additions & 7 deletions src/string/utf16.cr
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,40 @@ class String
# "hi 𐂥".to_utf16 # => Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
# ```
def to_utf16 : Slice(UInt16)
size = 0
if ascii_only?
# size == bytesize, so each char fits in one UInt16

# This is essentially equivalent to `to_slice.map(&.to_u16)` but also
# allocates a trailing null code unit (`0_u16`) after the string.
slice = Slice(UInt16).new(bytesize + 1) do |i|
if i == bytesize
0_u16
else
unsafe_byte_at(i).to_u16
end
end
return slice[0, bytesize]
end

# size < bytesize, so we need to count the number of characters that are
# two UInt16 wide.
u16_size = 0
each_char do |char|
size += char.ord < 0x10000 ? 1 : 2
u16_size += char.ord < 0x1_0000 ? 1 : 2
end

slice = Slice(UInt16).new(size + 1)
# Allocate one extra character for trailing null
slice = Slice(UInt16).new(u16_size + 1)

i = 0
each_char do |char|
ord = char.ord
if ord <= 0xd800 || (0xe000 <= ord < 0x10000)
if ord <= 0xd800 || (0xe000 <= ord < 0x1_0000)
# One UInt16 is enough
slice[i] = ord.to_u16
elsif ord >= 0x10000
elsif ord >= 0x1_0000
# Needs surrogate pair
ord -= 0x10000
ord -= 0x1_0000
slice[i] = 0xd800_u16 + ((ord >> 10) & 0x3ff) # Keep top 10 bits
i += 1
slice[i] = 0xdc00_u16 + (ord & 0x3ff) # Keep low 10 bits
Expand All @@ -41,7 +59,7 @@ class String
# Append trailing null code unit (0_u16)
slice[i] = 0_u16

slice[0, size]
slice[0, u16_size]
end

# Decodes the given *slice* UTF-16 sequence into a String.
Expand Down

0 comments on commit 7fd3863

Please sign in to comment.