From bc1007d5caa027e033180b095c5d4ecbbcafaa11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= Date: Wed, 27 Nov 2019 19:27:05 +0100 Subject: [PATCH 1/3] Refactor String#to_utf16 optimizing for ascii-only --- src/string/utf16.cr | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/string/utf16.cr b/src/string/utf16.cr index 7ccdd08123f5..a54bf131d53e 100644 --- a/src/string/utf16.cr +++ b/src/string/utf16.cr @@ -12,22 +12,30 @@ class String # "hi 𐂥".to_utf16 # => Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16] # ``` def to_utf16 : Slice(UInt16) - size = 0 - each_char do |char| - size += char.ord < 0x10000 ? 1 : 2 + if ascii_only? + # size == bytesize, so each char fits in one UInt16 + u16_size = bytesize + else + # size < bytesize, so we need to add one UInt16 for each character that + # is two UInt16 wide. + u16_size = 0 + each_char do |char| + u16_size += char.ord < 0x1_0000 ? 1 : 2 + end end - slice = Slice(UInt16).new(size + 1) + # Allocate one extra character for trailing null + slice = Slice(UInt16).new(u16_size + 1) i = 0 each_char do |char| ord = char.ord - if ord <= 0xd800 || (0xe000 <= ord < 0x10000) + if ord <= 0xd800 || (0xe000 <= ord < 0x1_0000) # One UInt16 is enough slice[i] = ord.to_u16 - elsif ord >= 0x10000 + elsif ord >= 0x1_0000 # Needs surrogate pair - ord -= 0x10000 + ord -= 0x1_0000 slice[i] = 0xd800_u16 + ((ord >> 10) & 0x3ff) # Keep top 10 bits i += 1 slice[i] = 0xdc00_u16 + (ord & 0x3ff) # Keep low 10 bits @@ -41,7 +49,7 @@ class String # Append null byte slice[i] = 0_u16 - slice[0, size] + slice[0, u16_size] end # Decodes the given *slice* UTF-16 sequence into a String. From 6a36aad18a386997f633d763934c38eed663102c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= Date: Wed, 27 Nov 2019 21:46:22 +0100 Subject: [PATCH 2/3] fixup! 
Refactor String#to_utf16 optimizing for ascii-only --- src/string/utf16.cr | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/string/utf16.cr b/src/string/utf16.cr index a54bf131d53e..676f2bdf9074 100644 --- a/src/string/utf16.cr +++ b/src/string/utf16.cr @@ -14,14 +14,14 @@ class String def to_utf16 : Slice(UInt16) if ascii_only? # size == bytesize, so each char fits in one UInt16 - u16_size = bytesize - else - # size < bytesize, so we need to add one UInt16 for each character that - # is two UInt16 wide. - u16_size = 0 - each_char do |char| - u16_size += char.ord < 0x1_0000 ? 1 : 2 - end + return to_slice.map &.to_u16 + end + + # size < bytesize, so we need to count the number of characters that are + # two UInt16 wide. + u16_size = 0 + each_char do |char| + u16_size += char.ord < 0x1_0000 ? 1 : 2 end # Allocate one extra character for trailing null From e0047ad38b1da590c034618dafc3ed6e1730c878 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= Date: Thu, 28 Nov 2019 01:45:11 +0100 Subject: [PATCH 3/3] fixup! 
Refactor String#to_utf16 optimizing for ascii-only --- spec/std/string/utf16_spec.cr | 10 ++++++++++ src/string/utf16.cr | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/spec/std/string/utf16_spec.cr b/spec/std/string/utf16_spec.cr index 3144e4a943f1..ef1b78be1bcf 100644 --- a/spec/std/string/utf16_spec.cr +++ b/spec/std/string/utf16_spec.cr @@ -2,24 +2,34 @@ require "spec" describe "String UTF16" do describe "to_utf16" do + it "in the range U+0000..U+FF" do + encoded = "\u{0}hello\u{ff}".to_utf16 + encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xff_u16]) + encoded.unsafe_fetch(encoded.size).should eq 0_u16 + end + it "in the range U+0000..U+D7FF" do encoded = "\u{0}hello\u{d7ff}".to_utf16 encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]) + encoded.unsafe_fetch(encoded.size).should eq 0_u16 end it "in the range U+E000 to U+FFFF" do encoded = "\u{e000}\u{ffff}".to_utf16 encoded.should eq(Slice[0xe000_u16, 0xffff_u16]) + encoded.unsafe_fetch(encoded.size).should eq 0_u16 end it "in the range U+10000..U+10FFFF" do encoded = "\u{10000}\u{10FFFF}".to_utf16 encoded.should eq(Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16]) + encoded.unsafe_fetch(encoded.size).should eq 0_u16 end it "in the range U+D800..U+DFFF" do encoded = "\u{D800}\u{DFFF}".to_utf16 encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16]) + encoded.unsafe_fetch(encoded.size).should eq 0_u16 end end diff --git a/src/string/utf16.cr b/src/string/utf16.cr index 676f2bdf9074..f4f247b96156 100644 --- a/src/string/utf16.cr +++ b/src/string/utf16.cr @@ -14,7 +14,17 @@ class String def to_utf16 : Slice(UInt16) if ascii_only? # size == bytesize, so each char fits in one UInt16 - return to_slice.map &.to_u16 + + # This is essentially equivalent to `to_slice.map(&.to_u16)` but also makes + # sure to allocate a trailing null code unit (0_u16) after the string. 
+ slice = Slice(UInt16).new(bytesize + 1) do |i| + if i == bytesize + 0_u16 + else + unsafe_byte_at(i).to_u16 + end + end + return slice[0, bytesize] end # size < bytesize, so we need to count the number of characters that are