From c83ae6da9aa8506275deddad22f002c917797912 Mon Sep 17 00:00:00 2001
From: Benoit Daloze <benoit.daloze@oracle.com>
Date: Wed, 5 Oct 2022 17:10:26 +0000
Subject: [PATCH] [GR-18163] Use the compatible encoding for
 String#{sub,gsub,index,rindex} (#2749)

PullRequest: truffleruby/3504
(cherry picked from commit bf6272d33beb18d6eee4ec95ecd9dcf9a07bc526)
---
 CHANGELOG.md                                  |  1 +
 spec/ruby/core/string/gsub_spec.rb            | 14 ++++-
 spec/ruby/core/string/index_spec.rb           |  8 +++
 spec/ruby/core/string/rindex_spec.rb          |  8 +++
 spec/ruby/core/string/sub_spec.rb             | 11 ++++
 .../truffleruby/core/string/StringNodes.java  | 63 ++++++++++---------
 src/main/ruby/truffleruby/core/string.rb      | 10 +--
 .../core/truffle/regexp_operations.rb         |  4 +-
 .../core/truffle/string_operations.rb         | 24 +++----
 9 files changed, 91 insertions(+), 52 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 173538ca04d4..0a1be8744505 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ Bug fixes:
 * Fix `String#split` missing a value in its return array when called with a pattern of `" "` and a _limit_ value > 0 on a string with trailing whitespace where the limit hasn't been met (@nirvdrum).
 * Fix `Kernel#sleep` and `Mutex#sleep` for durations smaller than 1 millisecond (#2716, @eregon).
 * Fix `IO#{wait,wait_readable,wait_writable}` with a timeout > INT_MAX seconds (@eregon).
+* Use the compatible encoding for `String#{sub,gsub,index,rindex}` (#2749, @eregon).
 
 Compatibility:
 
diff --git a/spec/ruby/core/string/gsub_spec.rb b/spec/ruby/core/string/gsub_spec.rb
index 3211ebbd0ac8..65a36a95ba7d 100644
--- a/spec/ruby/core/string/gsub_spec.rb
+++ b/spec/ruby/core/string/gsub_spec.rb
@@ -210,8 +210,6 @@ def replacement.to_str() "hello_replacement" end
     end
   end
 
-  # Note: $~ cannot be tested because mspec messes with it
-
   it "sets $~ to MatchData of last match and nil when there's none" do
     'hello.'.gsub('hello', 'x')
     $~[0].should == 'hello'
@@ -225,6 +223,18 @@ def replacement.to_str() "hello_replacement" end
     'hello.'.gsub(/not/, 'x')
     $~.should == nil
   end
+
+  it "handles a pattern in a superset encoding" do
+    result = 'abc'.force_encoding(Encoding::US_ASCII).gsub('é', 'è')
+    result.should == 'abc'
+    result.encoding.should == Encoding::US_ASCII
+  end
+
+  it "handles a pattern in a subset encoding" do
+    result = 'été'.gsub('t'.force_encoding(Encoding::US_ASCII), 'u')
+    result.should == 'éué'
+    result.encoding.should == Encoding::UTF_8
+  end
 end
 
 describe "String#gsub with pattern and Hash" do
diff --git a/spec/ruby/core/string/index_spec.rb b/spec/ruby/core/string/index_spec.rb
index 5d77a88e4e00..2eeee9be8741 100644
--- a/spec/ruby/core/string/index_spec.rb
+++ b/spec/ruby/core/string/index_spec.rb
@@ -159,6 +159,14 @@
       "あれ".index char
     end.should raise_error(Encoding::CompatibilityError)
   end
+
+  it "handles a substring in a superset encoding" do
+    'abc'.force_encoding(Encoding::US_ASCII).index('é').should == nil
+  end
+
+  it "handles a substring in a subset encoding" do
+    'été'.index('t'.force_encoding(Encoding::US_ASCII)).should == 1
+  end
 end
 
 describe "String#index with Regexp" do
diff --git a/spec/ruby/core/string/rindex_spec.rb b/spec/ruby/core/string/rindex_spec.rb
index a3b437a1e478..e795105e1da1 100644
--- a/spec/ruby/core/string/rindex_spec.rb
+++ b/spec/ruby/core/string/rindex_spec.rb
@@ -196,6 +196,14 @@ def obj.method_missing(*args) 5 end
   it "raises a TypeError when given offset is nil" do
     -> { "str".rindex("st", nil) }.should raise_error(TypeError)
   end
+
+  it "handles a substring in a superset encoding" do
+    'abc'.force_encoding(Encoding::US_ASCII).rindex('é').should == nil
+  end
+
+  it "handles a substring in a subset encoding" do
+    'été'.rindex('t'.force_encoding(Encoding::US_ASCII)).should == 1
+  end
 end
 
 describe "String#rindex with Regexp" do
diff --git a/spec/ruby/core/string/sub_spec.rb b/spec/ruby/core/string/sub_spec.rb
index 9effe88c2789..e6b73448c51f 100644
--- a/spec/ruby/core/string/sub_spec.rb
+++ b/spec/ruby/core/string/sub_spec.rb
@@ -214,6 +214,17 @@
     "ababa".sub(/(b)/, '\\\\\1').should == "a\\baba"
   end
 
+  it "handles a pattern in a superset encoding" do
+    result = 'abc'.force_encoding(Encoding::US_ASCII).sub('é', 'è')
+    result.should == 'abc'
+    result.encoding.should == Encoding::US_ASCII
+  end
+
+  it "handles a pattern in a subset encoding" do
+    result = 'été'.sub('t'.force_encoding(Encoding::US_ASCII), 'u')
+    result.should == 'éué'
+    result.encoding.should == Encoding::UTF_8
+  end
 end
 
 describe "String#sub with pattern and block" do
diff --git a/src/main/java/org/truffleruby/core/string/StringNodes.java b/src/main/java/org/truffleruby/core/string/StringNodes.java
index 9bf33151c26e..f5a11fb6086a 100644
--- a/src/main/java/org/truffleruby/core/string/StringNodes.java
+++ b/src/main/java/org/truffleruby/core/string/StringNodes.java
@@ -3774,7 +3774,7 @@ protected Object findStringByteIndex(Object rubyString, Object rubyPattern, int
 
     }
 
-    @Primitive(name = "string_byte_character_index", lowerFixnum = 1)
+    @Primitive(name = "byte_index_to_character_index", lowerFixnum = 1)
     public abstract static class StringByteCharacterIndexNode extends PrimitiveArrayArgumentsNode {
         @Specialization
         protected int byteIndexToCodePointIndex(Object string, int byteIndex,
@@ -3786,8 +3786,20 @@ protected int byteIndexToCodePointIndex(Object string, int byteIndex,
         }
     }
 
+    // Named 'string_byte_index' in Rubinius.
+    @Primitive(name = "character_index_to_byte_index", lowerFixnum = 1)
+    public abstract static class StringByteIndexFromCharIndexNode extends PrimitiveArrayArgumentsNode {
+        @Specialization
+        protected Object byteIndexFromCharIndex(Object string, int characterIndex,
+                @Cached TruffleString.CodePointIndexToByteIndexNode codePointIndexToByteIndexNode,
+                @Cached RubyStringLibrary libString) {
+            return codePointIndexToByteIndexNode.execute(libString.getTString(string), 0, characterIndex,
+                    libString.getTEncoding(string));
+        }
+    }
+
     /** Search pattern in string starting after offset characters, and return a character index or nil */
-    @Primitive(name = "string_character_index", lowerFixnum = 2)
+    @Primitive(name = "string_character_index", lowerFixnum = 3)
     public abstract static class StringCharacterIndexNode extends PrimitiveArrayArgumentsNode {
 
         protected final RubyStringLibrary libString = RubyStringLibrary.create();
@@ -3795,7 +3807,8 @@ public abstract static class StringCharacterIndexNode extends PrimitiveArrayArgu
         @Child SingleByteOptimizableNode singleByteOptimizableNode = SingleByteOptimizableNode.create();
 
         @Specialization(guards = "singleByteOptimizableNode.execute(string, stringEncoding)")
-        protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, int codePointOffset,
+        protected Object singleByteOptimizable(
+                Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int codePointOffset,
                 @Bind("libString.getTString(rubyString)") AbstractTruffleString string,
                 @Bind("libString.getEncoding(rubyString)") RubyEncoding stringEncoding,
                 @Bind("libPattern.getTString(rubyPattern)") AbstractTruffleString pattern,
@@ -3808,12 +3821,11 @@ protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, in
             // When single-byte optimizable, the byte length and the codepoint length are the same.
             int stringByteLength = string.byteLength(stringEncoding.tencoding);
 
-            assert codePointOffset + pattern.byteLength(
-                    patternEncoding.tencoding) <= stringByteLength : "already checked in the caller, String#index";
+            assert codePointOffset + pattern.byteLength(patternEncoding.tencoding) <= stringByteLength
+                    : "already checked in the caller, String#index";
 
-            int found = byteIndexOfStringNode.execute(string, pattern, codePointOffset,
-                    stringByteLength,
-                    stringEncoding.tencoding);
+            int found = byteIndexOfStringNode.execute(string, pattern, codePointOffset, stringByteLength,
+                    compatibleEncoding.tencoding);
 
             if (foundProfile.profile(found >= 0)) {
                 return found;
@@ -3823,7 +3835,8 @@ protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, in
         }
 
         @Specialization(guards = "!singleByteOptimizableNode.execute(string, stringEncoding)")
-        protected Object multiByte(Object rubyString, Object rubyPattern, int codePointOffset,
+        protected Object multiByte(
+                Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int codePointOffset,
                 @Bind("libString.getTString(rubyString)") AbstractTruffleString string,
                 @Bind("libString.getEncoding(rubyString)") RubyEncoding stringEncoding,
                 @Bind("libPattern.getTString(rubyPattern)") AbstractTruffleString pattern,
@@ -3838,7 +3851,7 @@ protected Object multiByte(Object rubyString, Object rubyPattern, int codePointO
 
             int stringCodePointLength = codePointLengthNode.execute(string, stringEncoding.tencoding);
             int found = indexOfStringNode.execute(string, pattern, codePointOffset, stringCodePointLength,
-                    stringEncoding.tencoding);
+                    compatibleEncoding.tencoding);
 
             if (foundProfile.profile(found >= 0)) {
                 return found;
@@ -3849,11 +3862,12 @@ protected Object multiByte(Object rubyString, Object rubyPattern, int codePointO
     }
 
     /** Search pattern in string starting after offset bytes, and return a byte index or nil */
-    @Primitive(name = "string_byte_index", lowerFixnum = 2)
+    @Primitive(name = "string_byte_index", lowerFixnum = 3)
     public abstract static class StringByteIndexNode extends PrimitiveArrayArgumentsNode {
 
         @Specialization
-        protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byteOffset,
+        protected Object stringByteIndex(
+                Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int byteOffset,
                 @Cached RubyStringLibrary libString,
                 @Cached RubyStringLibrary libPattern,
                 @Cached TruffleString.ByteIndexOfStringNode byteIndexOfStringNode,
@@ -3862,18 +3876,17 @@ protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byte
             assert byteOffset >= 0;
 
             var string = libString.getTString(rubyString);
-            var stringEncoding = libString.getEncoding(rubyString).tencoding;
-            int stringByteLength = string.byteLength(stringEncoding);
+            int stringByteLength = libString.byteLength(rubyString);
 
             var pattern = libPattern.getTString(rubyPattern);
-            var patternEncoding = libPattern.getEncoding(rubyPattern).tencoding;
-            int patternByteLength = pattern.byteLength(patternEncoding);
+            int patternByteLength = libPattern.byteLength(rubyPattern);
 
             if (indexOutOfBoundsProfile.profile(byteOffset + patternByteLength > stringByteLength)) {
                 return nil;
             }
 
-            int found = byteIndexOfStringNode.execute(string, pattern, byteOffset, stringByteLength, stringEncoding);
+            int found = byteIndexOfStringNode.execute(string, pattern, byteOffset, stringByteLength,
+                    compatibleEncoding.tencoding);
             if (foundProfile.profile(found >= 0)) {
                 return found;
             }
@@ -3882,18 +3895,6 @@ protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byte
         }
     }
 
-    // Named 'string_byte_index' in Rubinius.
-    @Primitive(name = "string_byte_index_from_char_index", lowerFixnum = 1)
-    public abstract static class StringByteIndexFromCharIndexNode extends PrimitiveArrayArgumentsNode {
-        @Specialization
-        protected Object byteIndexFromCharIndex(Object string, int characterIndex,
-                @Cached TruffleString.CodePointIndexToByteIndexNode codePointIndexToByteIndexNode,
-                @Cached RubyStringLibrary libString) {
-            return codePointIndexToByteIndexNode.execute(libString.getTString(string), 0, characterIndex,
-                    libString.getTEncoding(string));
-        }
-    }
-
     // Port of Rubinius's String::previous_byte_index.
     //
     // This method takes a byte index, finds the corresponding character the byte index belongs to, and then returns
@@ -3984,7 +3985,7 @@ protected Object stringRindex(Object rubyString, Object rubyPattern, int byteOff
             assert byteOffset >= 0;
 
             // Throw an exception if the encodings are not compatible.
-            checkEncodingNode.executeCheckEncoding(rubyString, rubyPattern);
+            var compatibleEncoding = checkEncodingNode.executeCheckEncoding(rubyString, rubyPattern);
 
             var string = libString.getTString(rubyString);
             var stringEncoding = libString.getEncoding(rubyString).tencoding;
@@ -4007,7 +4008,7 @@ protected Object stringRindex(Object rubyString, Object rubyPattern, int byteOff
             }
 
             int result = lastByteIndexOfStringNode.execute(string, pattern, normalizedStart + patternByteLength, 0,
-                    stringEncoding);
+                    compatibleEncoding.tencoding);
 
             if (result < 0) {
                 noMatchProfile.enter();
diff --git a/src/main/ruby/truffleruby/core/string.rb b/src/main/ruby/truffleruby/core/string.rb
index 34be0133a4a1..f08aaba7be9b 100644
--- a/src/main/ruby/truffleruby/core/string.rb
+++ b/src/main/ruby/truffleruby/core/string.rb
@@ -1025,7 +1025,7 @@ def index(str, start=undefined)
     if Primitive.object_kind_of?(str, Regexp)
       Primitive.encoding_ensure_compatible self, str
 
-      start = Primitive.string_byte_index_from_char_index(self, start)
+      start = Primitive.character_index_to_byte_index(self, start)
       if match = Truffle::RegexpOperations.match_from(str, self, start)
         Primitive.regexp_last_match_set(Primitive.caller_special_variables, match)
         return match.begin(0)
@@ -1038,11 +1038,11 @@ def index(str, start=undefined)
     str = StringValue(str)
     return start if str == ''
 
-    Primitive.encoding_ensure_compatible_str self, str
+    enc = Primitive.encoding_ensure_compatible_str self, str
 
     return if start + str.size > size
 
-    Primitive.string_character_index(self, str, start)
+    Primitive.string_character_index(self, str, enc, start)
   end
 
   def initialize(other = undefined, capacity: nil, encoding: nil)
@@ -1064,7 +1064,7 @@ def rindex(sub, finish=undefined)
       finish = size if finish >= size
     end
 
-    byte_finish = Primitive.string_byte_index_from_char_index(self, finish)
+    byte_finish = Primitive.character_index_to_byte_index(self, finish)
 
     if Primitive.object_kind_of?(sub, Regexp)
       Primitive.encoding_ensure_compatible self, sub
@@ -1085,7 +1085,7 @@ def rindex(sub, finish=undefined)
 
       Primitive.encoding_ensure_compatible_str self, needle
       if byte_index = Primitive.find_string_reverse(self, needle, byte_finish)
-        return Primitive.string_byte_character_index(self, byte_index)
+        return Primitive.byte_index_to_character_index(self, byte_index)
       end
     end
 
diff --git a/src/main/ruby/truffleruby/core/truffle/regexp_operations.rb b/src/main/ruby/truffleruby/core/truffle/regexp_operations.rb
index 76d487c6e202..57382f940bad 100644
--- a/src/main/ruby/truffleruby/core/truffle/regexp_operations.rb
+++ b/src/main/ruby/truffleruby/core/truffle/regexp_operations.rb
@@ -37,7 +37,7 @@ def self.match(re, str, pos=0)
 
       pos = pos < 0 ? pos + str.size : pos
       return nil if pos < 0 or pos > str.size
-      pos = Primitive.string_byte_index_from_char_index(str, pos)
+      pos = Primitive.character_index_to_byte_index(str, pos)
 
       search_region(re, str, pos, str.bytesize, true, true)
     end
@@ -50,7 +50,7 @@ def self.match?(re, str, pos=0)
 
       pos = pos < 0 ? pos + str.size : pos
       return false if pos < 0 or pos > str.size
-      pos = Primitive.string_byte_index_from_char_index(str, pos)
+      pos = Primitive.character_index_to_byte_index(str, pos)
 
       search_region(re, str, pos, str.bytesize, true, false)
     end
diff --git a/src/main/ruby/truffleruby/core/truffle/string_operations.rb b/src/main/ruby/truffleruby/core/truffle/string_operations.rb
index 6e6f95aa56af..8e36e0eabbab 100644
--- a/src/main/ruby/truffleruby/core/truffle/string_operations.rb
+++ b/src/main/ruby/truffleruby/core/truffle/string_operations.rb
@@ -105,7 +105,9 @@ def self.gsub_regexp_matches(global, orig, pattern)
     def self.gsub_string_matches(global, orig, pattern)
       res = []
       offset = 0
-      while index = byte_index(orig, pattern, offset)
+      enc = Primitive.encoding_ensure_compatible_str orig, pattern
+
+      while index = byte_index(orig, pattern, enc, offset)
         match = Primitive.matchdata_create_single_group(pattern, orig.dup, index, index + pattern.bytesize)
         res << match
         break unless global
@@ -285,7 +287,7 @@ def self.validate_case_mapping_options(options, downcasing)
     end
 
     # MRI: rb_str_byteindex_m
-    def self.byte_index(src, str, start=0)
+    def self.byte_index(src, str, enc, start = 0)
       start += src.bytesize if start < 0
       if start < 0 or start > src.bytesize
         Primitive.regexp_last_match_set(Primitive.caller_special_variables, nil) if Primitive.object_kind_of?(str, Regexp)
@@ -294,9 +296,7 @@ def self.byte_index(src, str, start=0)
 
       return start if str == ''
 
-      Primitive.encoding_ensure_compatible_str src, str
-
-      Primitive.string_byte_index(src, str, start)
+      Primitive.string_byte_index(src, str, enc, start)
     end
 
     def self.subpattern(string, pattern, capture)
@@ -320,7 +320,7 @@ def self.assign_index(string, index, count, replacement)
         raise IndexError, "index #{index} out of string"
       end
 
-      unless bi = Primitive.string_byte_index_from_char_index(string, index)
+      unless bi = Primitive.character_index_to_byte_index(string, index)
         raise IndexError, "unable to find character at: #{index}"
       end
 
@@ -335,10 +335,10 @@ def self.assign_index(string, index, count, replacement)
         if total >= string.size
           bs = string.bytesize - bi
         else
-          bs = Primitive.string_byte_index_from_char_index(string, total) - bi
+          bs = Primitive.character_index_to_byte_index(string, total) - bi
         end
       else
-        bs = index == string.size ? 0 : Primitive.string_byte_index_from_char_index(string, index + 1) - bi
+        bs = index == string.size ? 0 : Primitive.character_index_to_byte_index(string, index + 1) - bi
       end
 
       replacement = StringValue replacement
@@ -364,7 +364,7 @@ def self.assign_range(string, index, replacement)
 
       raise RangeError, "#{index.first} is out of range" if start < 0 or start > string.size
 
-      bi = Primitive.string_byte_index_from_char_index(string, start)
+      bi = Primitive.character_index_to_byte_index(string, start)
       raise IndexError, "unable to find character at: #{start}" unless bi
 
       if stop < start
@@ -372,7 +372,7 @@ def self.assign_range(string, index, replacement)
       elsif stop >= string.size
         bs = string.bytesize - bi
       else
-        bs = Primitive.string_byte_index_from_char_index(string, stop + 1) - bi
+        bs = Primitive.character_index_to_byte_index(string, stop + 1) - bi
       end
 
       replacement = StringValue replacement
@@ -406,8 +406,8 @@ def self.assign_regexp(string, index, count, replacement)
       replacement = StringValue replacement
       enc = Primitive.encoding_ensure_compatible_str string, replacement
 
-      bi = Primitive.string_byte_index_from_char_index(string, match.begin(count))
-      bs = Primitive.string_byte_index_from_char_index(string, match.end(count)) - bi
+      bi = Primitive.character_index_to_byte_index(string, match.begin(count))
+      bs = Primitive.character_index_to_byte_index(string, match.end(count)) - bi
 
       Primitive.string_splice(string, replacement, bi, bs, enc)
     end