diff --git a/spec/std/string/utf16_spec.cr b/spec/std/string/utf16_spec.cr index 3144e4a943f1..ef1b78be1bcf 100644 --- a/spec/std/string/utf16_spec.cr +++ b/spec/std/string/utf16_spec.cr @@ -2,24 +2,34 @@ require "spec" describe "String UTF16" do describe "to_utf16" do + it "in the range U+0000..U+FF" do + encoded = "\u{0}hello\u{ff}".to_utf16 + encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xff_u16]) + encoded.unsafe_fetch(encoded.size).should eq 0_u16 + end + it "in the range U+0000..U+D7FF" do encoded = "\u{0}hello\u{d7ff}".to_utf16 encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]) + encoded.unsafe_fetch(encoded.size).should eq 0_u16 end it "in the range U+E000 to U+FFFF" do encoded = "\u{e000}\u{ffff}".to_utf16 encoded.should eq(Slice[0xe000_u16, 0xffff_u16]) + encoded.unsafe_fetch(encoded.size).should eq 0_u16 end it "in the range U+10000..U+10FFFF" do encoded = "\u{10000}\u{10FFFF}".to_utf16 encoded.should eq(Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16]) + encoded.unsafe_fetch(encoded.size).should eq 0_u16 end it "in the range U+D800..U+DFFF" do encoded = "\u{D800}\u{DFFF}".to_utf16 encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16]) + encoded.unsafe_fetch(encoded.size).should eq 0_u16 end end diff --git a/src/string/utf16.cr b/src/string/utf16.cr index 7ccdd08123f5..f4f247b96156 100644 --- a/src/string/utf16.cr +++ b/src/string/utf16.cr @@ -12,22 +12,40 @@ class String # "hi 𐂥".to_utf16 # => Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16] # ``` def to_utf16 : Slice(UInt16) - size = 0 + if ascii_only? + # size == bytesize, so each char fits in one UInt16 + + # This is essentially equivalent to `to_slice.map(&.to_u16)` but also makes + # sure to allocate a null byte after the string. + slice = Slice(UInt16).new(bytesize + 1) do |i| + if i == bytesize + 0_u16 + else + unsafe_byte_at(i).to_u16 + end + end + return slice[0, bytesize] + end + + # size < bytesize, so we need to count the number of characters that are + # two UInt16 wide. + u16_size = 0 each_char do |char| - size += char.ord < 0x10000 ? 1 : 2 + u16_size += char.ord < 0x1_0000 ? 1 : 2 end - slice = Slice(UInt16).new(size + 1) + # Allocate one extra character for trailing null + slice = Slice(UInt16).new(u16_size + 1) i = 0 each_char do |char| ord = char.ord - if ord <= 0xd800 || (0xe000 <= ord < 0x10000) + if ord <= 0xd800 || (0xe000 <= ord < 0x1_0000) # One UInt16 is enough slice[i] = ord.to_u16 - elsif ord >= 0x10000 + elsif ord >= 0x1_0000 # Needs surrogate pair - ord -= 0x10000 + ord -= 0x1_0000 slice[i] = 0xd800_u16 + ((ord >> 10) & 0x3ff) # Keep top 10 bits i += 1 slice[i] = 0xdc00_u16 + (ord & 0x3ff) # Keep low 10 bits @@ -41,7 +59,7 @@ class String # Append null byte slice[i] = 0_u16 - slice[0, size] + slice[0, u16_size] end # Decodes the given *slice* UTF-16 sequence into a String.