From bc1007d5caa027e033180b095c5d4ecbbcafaa11 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20M=C3=BCller?= <straightshoota@gmail.com>
Date: Wed, 27 Nov 2019 19:27:05 +0100
Subject: [PATCH 1/3] Refactor String#to_utf16 optimizing for ascii-only

---
 src/string/utf16.cr | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/string/utf16.cr b/src/string/utf16.cr
index 7ccdd08123f5..a54bf131d53e 100644
--- a/src/string/utf16.cr
+++ b/src/string/utf16.cr
@@ -12,22 +12,30 @@ class String
   # "hi 𐂥".to_utf16 # => Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
   # ```
   def to_utf16 : Slice(UInt16)
-    size = 0
-    each_char do |char|
-      size += char.ord < 0x10000 ? 1 : 2
+    if ascii_only?
+      # size == bytesize, so each char fits in one UInt16
+      u16_size = bytesize
+    else
+      # size < bytesize, so we need to add one UInt16 for each character that
+      # is two UInt16 wide.
+      u16_size = 0
+      each_char do |char|
+        u16_size += char.ord < 0x1_0000 ? 1 : 2
+      end
     end
 
-    slice = Slice(UInt16).new(size + 1)
+    # Allocate one extra character for trailing null
+    slice = Slice(UInt16).new(u16_size + 1)
 
     i = 0
     each_char do |char|
       ord = char.ord
-      if ord <= 0xd800 || (0xe000 <= ord < 0x10000)
+      if ord <= 0xd800 || (0xe000 <= ord < 0x1_0000)
         # One UInt16 is enough
         slice[i] = ord.to_u16
-      elsif ord >= 0x10000
+      elsif ord >= 0x1_0000
         # Needs surrogate pair
-        ord -= 0x10000
+        ord -= 0x1_0000
         slice[i] = 0xd800_u16 + ((ord >> 10) & 0x3ff) # Keep top 10 bits
         i += 1
         slice[i] = 0xdc00_u16 + (ord & 0x3ff) # Keep low 10 bits
@@ -41,7 +49,7 @@ class String
     # Append null byte
     slice[i] = 0_u16
 
-    slice[0, size]
+    slice[0, u16_size]
   end
 
   # Decodes the given *slice* UTF-16 sequence into a String.

From 6a36aad18a386997f633d763934c38eed663102c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20M=C3=BCller?= <straightshoota@gmail.com>
Date: Wed, 27 Nov 2019 21:46:22 +0100
Subject: [PATCH 2/3] fixup! Refactor String#to_utf16 optimizing for ascii-only

---
 src/string/utf16.cr | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/string/utf16.cr b/src/string/utf16.cr
index a54bf131d53e..676f2bdf9074 100644
--- a/src/string/utf16.cr
+++ b/src/string/utf16.cr
@@ -14,14 +14,14 @@ class String
   def to_utf16 : Slice(UInt16)
     if ascii_only?
       # size == bytesize, so each char fits in one UInt16
-      u16_size = bytesize
-    else
-      # size < bytesize, so we need to add one UInt16 for each character that
-      # is two UInt16 wide.
-      u16_size = 0
-      each_char do |char|
-        u16_size += char.ord < 0x1_0000 ? 1 : 2
-      end
+      return to_slice.map &.to_u16
+    end
+
+    # size < bytesize, so we need to count the number of characters that are
+    # two UInt16 wide.
+    u16_size = 0
+    each_char do |char|
+      u16_size += char.ord < 0x1_0000 ? 1 : 2
     end
 
     # Allocate one extra character for trailing null

From e0047ad38b1da590c034618dafc3ed6e1730c878 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20M=C3=BCller?= <straightshoota@gmail.com>
Date: Thu, 28 Nov 2019 01:45:11 +0100
Subject: [PATCH 3/3] fixup! Refactor String#to_utf16 optimizing for ascii-only

---
 spec/std/string/utf16_spec.cr | 10 ++++++++++
 src/string/utf16.cr           | 12 +++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/spec/std/string/utf16_spec.cr b/spec/std/string/utf16_spec.cr
index 3144e4a943f1..ef1b78be1bcf 100644
--- a/spec/std/string/utf16_spec.cr
+++ b/spec/std/string/utf16_spec.cr
@@ -2,24 +2,34 @@ require "spec"
 
 describe "String UTF16" do
   describe "to_utf16" do
+    it "in the range U+0000..U+FF" do
+      encoded = "\u{0}hello\u{ff}".to_utf16
+      encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xff_u16])
+      encoded.unsafe_fetch(encoded.size).should eq 0_u16
+    end
+
     it "in the range U+0000..U+D7FF" do
       encoded = "\u{0}hello\u{d7ff}".to_utf16
       encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16])
+      encoded.unsafe_fetch(encoded.size).should eq 0_u16
     end
 
     it "in the range U+E000 to U+FFFF" do
       encoded = "\u{e000}\u{ffff}".to_utf16
       encoded.should eq(Slice[0xe000_u16, 0xffff_u16])
+      encoded.unsafe_fetch(encoded.size).should eq 0_u16
     end
 
     it "in the range U+10000..U+10FFFF" do
       encoded = "\u{10000}\u{10FFFF}".to_utf16
       encoded.should eq(Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16])
+      encoded.unsafe_fetch(encoded.size).should eq 0_u16
     end
 
     it "in the range U+D800..U+DFFF" do
       encoded = "\u{D800}\u{DFFF}".to_utf16
       encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16])
+      encoded.unsafe_fetch(encoded.size).should eq 0_u16
     end
   end
 
diff --git a/src/string/utf16.cr b/src/string/utf16.cr
index 676f2bdf9074..f4f247b96156 100644
--- a/src/string/utf16.cr
+++ b/src/string/utf16.cr
@@ -14,7 +14,17 @@ class String
   def to_utf16 : Slice(UInt16)
     if ascii_only?
       # size == bytesize, so each char fits in one UInt16
-      return to_slice.map &.to_u16
+
+      # This is essentially equivalent to `to_slice.map(&.to_u16)` but also
+      # allocates a trailing null code unit (`0_u16`) right after the string.
+      slice = Slice(UInt16).new(bytesize + 1) do |i|
+        if i == bytesize
+          0_u16
+        else
+          unsafe_byte_at(i).to_u16
+        end
+      end
+      return slice[0, bytesize]
     end
 
     # size < bytesize, so we need to count the number of characters that are