From c83ae6da9aa8506275deddad22f002c917797912 Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Wed, 5 Oct 2022 17:10:26 +0000 Subject: [PATCH] [GR-18163] Use the compatible encoding for String#{sub,gsub,index,rindex} (#2749) PullRequest: truffleruby/3504 (cherry picked from commit bf6272d33beb18d6eee4ec95ecd9dcf9a07bc526) --- CHANGELOG.md | 1 + spec/ruby/core/string/gsub_spec.rb | 14 ++++- spec/ruby/core/string/index_spec.rb | 8 +++ spec/ruby/core/string/rindex_spec.rb | 8 +++ spec/ruby/core/string/sub_spec.rb | 11 ++++ .../truffleruby/core/string/StringNodes.java | 63 ++++++++++--------- src/main/ruby/truffleruby/core/string.rb | 10 +-- .../core/truffle/regexp_operations.rb | 4 +- .../core/truffle/string_operations.rb | 24 +++---- 9 files changed, 91 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 173538ca04d4..0a1be8744505 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Bug fixes: * Fix `String#split` missing a value in its return array when called with a pattern of `" "` and a _limit_ value > 0 on a string with trailing whitespace where the limit hasn't been met (@nirvdrum). * Fix `Kernel#sleep` and `Mutex#sleep` for durations smaller than 1 millisecond (#2716, @eregon). * Fix `IO#{wait,wait_readable,wait_writable}` with a timeout > INT_MAX seconds (@eregon). +* Use the compatible encoding for `String#{sub,gsub,index,rindex}` (#2749, @eregon). Compatibility: diff --git a/spec/ruby/core/string/gsub_spec.rb b/spec/ruby/core/string/gsub_spec.rb index 3211ebbd0ac8..65a36a95ba7d 100644 --- a/spec/ruby/core/string/gsub_spec.rb +++ b/spec/ruby/core/string/gsub_spec.rb @@ -210,8 +210,6 @@ def replacement.to_str() "hello_replacement" end end end - # Note: $~ cannot be tested because mspec messes with it - it "sets $~ to MatchData of last match and nil when there's none" do 'hello.'.gsub('hello', 'x') $~[0].should == 'hello' @@ -225,6 +223,18 @@ def replacement.to_str() "hello_replacement" end 'hello.'.gsub(/not/, 'x') $~.should == nil end + + it "handles a pattern in a superset encoding" do + result = 'abc'.force_encoding(Encoding::US_ASCII).gsub('é', 'è') + result.should == 'abc' + result.encoding.should == Encoding::US_ASCII + end + + it "handles a pattern in a subset encoding" do + result = 'été'.gsub('t'.force_encoding(Encoding::US_ASCII), 'u') + result.should == 'éué' + result.encoding.should == Encoding::UTF_8 + end end describe "String#gsub with pattern and Hash" do diff --git a/spec/ruby/core/string/index_spec.rb b/spec/ruby/core/string/index_spec.rb index 5d77a88e4e00..2eeee9be8741 100644 --- a/spec/ruby/core/string/index_spec.rb +++ b/spec/ruby/core/string/index_spec.rb @@ -159,6 +159,14 @@ "あれ".index char end.should raise_error(Encoding::CompatibilityError) end + + it "handles a substring in a superset encoding" do + 'abc'.force_encoding(Encoding::US_ASCII).index('é').should == nil + end + + it "handles a substring in a subset encoding" do + 'été'.index('t'.force_encoding(Encoding::US_ASCII)).should == 1 + end end describe "String#index with Regexp" do diff --git a/spec/ruby/core/string/rindex_spec.rb b/spec/ruby/core/string/rindex_spec.rb index a3b437a1e478..e795105e1da1 100644 --- a/spec/ruby/core/string/rindex_spec.rb +++ b/spec/ruby/core/string/rindex_spec.rb @@ -196,6 +196,14 @@ def obj.method_missing(*args) 5 end it "raises a TypeError when given offset is nil" do -> { "str".rindex("st", nil) }.should raise_error(TypeError) end + + it "handles a substring in a superset encoding" do + 'abc'.force_encoding(Encoding::US_ASCII).rindex('é').should == nil + end + + it "handles a substring in a subset encoding" do + 'été'.rindex('t'.force_encoding(Encoding::US_ASCII)).should == 1 + end end describe "String#rindex with Regexp" do diff --git a/spec/ruby/core/string/sub_spec.rb b/spec/ruby/core/string/sub_spec.rb index 9effe88c2789..e6b73448c51f 100644 --- a/spec/ruby/core/string/sub_spec.rb +++ b/spec/ruby/core/string/sub_spec.rb @@ -214,6 +214,17 @@ "ababa".sub(/(b)/, '\\\\\1').should == "a\\baba" end + it "handles a pattern in a superset encoding" do + result = 'abc'.force_encoding(Encoding::US_ASCII).sub('é', 'è') + result.should == 'abc' + result.encoding.should == Encoding::US_ASCII + end + + it "handles a pattern in a subset encoding" do + result = 'été'.sub('t'.force_encoding(Encoding::US_ASCII), 'u') + result.should == 'éué' + result.encoding.should == Encoding::UTF_8 + end end describe "String#sub with pattern and block" do diff --git a/src/main/java/org/truffleruby/core/string/StringNodes.java b/src/main/java/org/truffleruby/core/string/StringNodes.java index 9bf33151c26e..f5a11fb6086a 100644 --- a/src/main/java/org/truffleruby/core/string/StringNodes.java +++ b/src/main/java/org/truffleruby/core/string/StringNodes.java @@ -3774,7 +3774,7 @@ protected Object findStringByteIndex(Object rubyString, Object rubyPattern, int } - @Primitive(name = "string_byte_character_index", lowerFixnum = 1) + @Primitive(name = "byte_index_to_character_index", lowerFixnum = 1) public abstract static class StringByteCharacterIndexNode extends PrimitiveArrayArgumentsNode { @Specialization protected int byteIndexToCodePointIndex(Object string, int byteIndex, @@ -3786,8 +3786,20 @@ protected int byteIndexToCodePointIndex(Object string, int byteIndex, } } + // Named 'string_byte_index' in Rubinius. + @Primitive(name = "character_index_to_byte_index", lowerFixnum = 1) + public abstract static class StringByteIndexFromCharIndexNode extends PrimitiveArrayArgumentsNode { + @Specialization + protected Object byteIndexFromCharIndex(Object string, int characterIndex, + @Cached TruffleString.CodePointIndexToByteIndexNode codePointIndexToByteIndexNode, + @Cached RubyStringLibrary libString) { + return codePointIndexToByteIndexNode.execute(libString.getTString(string), 0, characterIndex, + libString.getTEncoding(string)); + } + } + /** Search pattern in string starting after offset characters, and return a character index or nil */ - @Primitive(name = "string_character_index", lowerFixnum = 2) + @Primitive(name = "string_character_index", lowerFixnum = 3) public abstract static class StringCharacterIndexNode extends PrimitiveArrayArgumentsNode { protected final RubyStringLibrary libString = RubyStringLibrary.create(); @@ -3795,7 +3807,8 @@ public abstract static class StringCharacterIndexNode extends PrimitiveArrayArgu @Child SingleByteOptimizableNode singleByteOptimizableNode = SingleByteOptimizableNode.create(); @Specialization(guards = "singleByteOptimizableNode.execute(string, stringEncoding)") - protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, int codePointOffset, + protected Object singleByteOptimizable( + Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int codePointOffset, @Bind("libString.getTString(rubyString)") AbstractTruffleString string, @Bind("libString.getEncoding(rubyString)") RubyEncoding stringEncoding, @Bind("libPattern.getTString(rubyPattern)") AbstractTruffleString pattern, @@ -3808,12 +3821,11 @@ protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, in // When single-byte optimizable, the byte length and the codepoint length are the same. int stringByteLength = string.byteLength(stringEncoding.tencoding); - assert codePointOffset + pattern.byteLength( - patternEncoding.tencoding) <= stringByteLength : "already checked in the caller, String#index"; + assert codePointOffset + pattern.byteLength(patternEncoding.tencoding) <= stringByteLength + : "already checked in the caller, String#index"; - int found = byteIndexOfStringNode.execute(string, pattern, codePointOffset, - stringByteLength, - stringEncoding.tencoding); + int found = byteIndexOfStringNode.execute(string, pattern, codePointOffset, stringByteLength, + compatibleEncoding.tencoding); if (foundProfile.profile(found >= 0)) { return found; @@ -3823,7 +3835,8 @@ protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, in } @Specialization(guards = "!singleByteOptimizableNode.execute(string, stringEncoding)") - protected Object multiByte(Object rubyString, Object rubyPattern, int codePointOffset, + protected Object multiByte( + Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int codePointOffset, @Bind("libString.getTString(rubyString)") AbstractTruffleString string, @Bind("libString.getEncoding(rubyString)") RubyEncoding stringEncoding, @Bind("libPattern.getTString(rubyPattern)") AbstractTruffleString pattern, @@ -3838,7 +3851,7 @@ protected Object multiByte(Object rubyString, Object rubyPattern, int codePointO int stringCodePointLength = codePointLengthNode.execute(string, stringEncoding.tencoding); int found = indexOfStringNode.execute(string, pattern, codePointOffset, stringCodePointLength, - stringEncoding.tencoding); + compatibleEncoding.tencoding); if (foundProfile.profile(found >= 0)) { return found; @@ -3849,11 +3862,12 @@ protected Object multiByte(Object rubyString, Object rubyPattern, int codePointO } /** Search pattern in string starting after offset bytes, and return a byte index or nil */ - @Primitive(name = "string_byte_index", lowerFixnum = 2) + @Primitive(name = "string_byte_index", lowerFixnum = 3) public abstract static class StringByteIndexNode extends PrimitiveArrayArgumentsNode { @Specialization - protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byteOffset, + protected Object stringByteIndex( + Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int byteOffset, @Cached RubyStringLibrary libString, @Cached RubyStringLibrary libPattern, @Cached TruffleString.ByteIndexOfStringNode byteIndexOfStringNode, @@ -3862,18 +3876,17 @@ protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byte assert byteOffset >= 0; var string = libString.getTString(rubyString); - var stringEncoding = libString.getEncoding(rubyString).tencoding; - int stringByteLength = string.byteLength(stringEncoding); + int stringByteLength = libString.byteLength(rubyString); var pattern = libPattern.getTString(rubyPattern); - var patternEncoding = libPattern.getEncoding(rubyPattern).tencoding; - int patternByteLength = pattern.byteLength(patternEncoding); + int patternByteLength = libPattern.byteLength(rubyPattern); if (indexOutOfBoundsProfile.profile(byteOffset + patternByteLength > stringByteLength)) { return nil; } - int found = byteIndexOfStringNode.execute(string, pattern, byteOffset, stringByteLength, stringEncoding); + int found = byteIndexOfStringNode.execute(string, pattern, byteOffset, stringByteLength, + compatibleEncoding.tencoding); if (foundProfile.profile(found >= 0)) { return found; } @@ -3882,18 +3895,6 @@ protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byte } } - // Named 'string_byte_index' in Rubinius. - @Primitive(name = "string_byte_index_from_char_index", lowerFixnum = 1) - public abstract static class StringByteIndexFromCharIndexNode extends PrimitiveArrayArgumentsNode { - @Specialization - protected Object byteIndexFromCharIndex(Object string, int characterIndex, - @Cached TruffleString.CodePointIndexToByteIndexNode codePointIndexToByteIndexNode, - @Cached RubyStringLibrary libString) { - return codePointIndexToByteIndexNode.execute(libString.getTString(string), 0, characterIndex, - libString.getTEncoding(string)); - } - } - // Port of Rubinius's String::previous_byte_index. // // This method takes a byte index, finds the corresponding character the byte index belongs to, and then returns @@ -3984,7 +3985,7 @@ protected Object stringRindex(Object rubyString, Object rubyPattern, int byteOff assert byteOffset >= 0; // Throw an exception if the encodings are not compatible. - checkEncodingNode.executeCheckEncoding(rubyString, rubyPattern); + var compatibleEncoding = checkEncodingNode.executeCheckEncoding(rubyString, rubyPattern); var string = libString.getTString(rubyString); var stringEncoding = libString.getEncoding(rubyString).tencoding; @@ -4007,7 +4008,7 @@ protected Object stringRindex(Object rubyString, Object rubyPattern, int byteOff } int result = lastByteIndexOfStringNode.execute(string, pattern, normalizedStart + patternByteLength, 0, - stringEncoding); + compatibleEncoding.tencoding); if (result < 0) { noMatchProfile.enter(); diff --git a/src/main/ruby/truffleruby/core/string.rb b/src/main/ruby/truffleruby/core/string.rb index 34be0133a4a1..f08aaba7be9b 100644 --- a/src/main/ruby/truffleruby/core/string.rb +++ b/src/main/ruby/truffleruby/core/string.rb @@ -1025,7 +1025,7 @@ def index(str, start=undefined) if Primitive.object_kind_of?(str, Regexp) Primitive.encoding_ensure_compatible self, str - start = Primitive.string_byte_index_from_char_index(self, start) + start = Primitive.character_index_to_byte_index(self, start) if match = Truffle::RegexpOperations.match_from(str, self, start) Primitive.regexp_last_match_set(Primitive.caller_special_variables, match) return match.begin(0) @@ -1038,11 +1038,11 @@ def index(str, start=undefined) str = StringValue(str) return start if str == '' - Primitive.encoding_ensure_compatible_str self, str + enc = Primitive.encoding_ensure_compatible_str self, str return if start + str.size > size - Primitive.string_character_index(self, str, start) + Primitive.string_character_index(self, str, enc, start) end def initialize(other = undefined, capacity: nil, encoding: nil) @@ -1064,7 +1064,7 @@ def rindex(sub, finish=undefined) finish = size if finish >= size end - byte_finish = Primitive.string_byte_index_from_char_index(self, finish) + byte_finish = Primitive.character_index_to_byte_index(self, finish) if Primitive.object_kind_of?(sub, Regexp) Primitive.encoding_ensure_compatible self, sub @@ -1085,7 +1085,7 @@ def rindex(sub, finish=undefined) Primitive.encoding_ensure_compatible_str self, needle if byte_index = Primitive.find_string_reverse(self, needle, byte_finish) - return Primitive.string_byte_character_index(self, byte_index) + return Primitive.byte_index_to_character_index(self, byte_index) end end diff --git a/src/main/ruby/truffleruby/core/truffle/regexp_operations.rb b/src/main/ruby/truffleruby/core/truffle/regexp_operations.rb index 76d487c6e202..57382f940bad 100644 --- a/src/main/ruby/truffleruby/core/truffle/regexp_operations.rb +++ b/src/main/ruby/truffleruby/core/truffle/regexp_operations.rb @@ -37,7 +37,7 @@ def self.match(re, str, pos=0) pos = pos < 0 ? pos + str.size : pos return nil if pos < 0 or pos > str.size - pos = Primitive.string_byte_index_from_char_index(str, pos) + pos = Primitive.character_index_to_byte_index(str, pos) search_region(re, str, pos, str.bytesize, true, true) end @@ -50,7 +50,7 @@ def self.match?(re, str, pos=0) pos = pos < 0 ? pos + str.size : pos return false if pos < 0 or pos > str.size - pos = Primitive.string_byte_index_from_char_index(str, pos) + pos = Primitive.character_index_to_byte_index(str, pos) search_region(re, str, pos, str.bytesize, true, false) end diff --git a/src/main/ruby/truffleruby/core/truffle/string_operations.rb b/src/main/ruby/truffleruby/core/truffle/string_operations.rb index 6e6f95aa56af..8e36e0eabbab 100644 --- a/src/main/ruby/truffleruby/core/truffle/string_operations.rb +++ b/src/main/ruby/truffleruby/core/truffle/string_operations.rb @@ -105,7 +105,9 @@ def self.gsub_regexp_matches(global, orig, pattern) def self.gsub_string_matches(global, orig, pattern) res = [] offset = 0 - while index = byte_index(orig, pattern, offset) + enc = Primitive.encoding_ensure_compatible_str orig, pattern + + while index = byte_index(orig, pattern, enc, offset) match = Primitive.matchdata_create_single_group(pattern, orig.dup, index, index + pattern.bytesize) res << match break unless global @@ -285,7 +287,7 @@ def self.validate_case_mapping_options(options, downcasing) end # MRI: rb_str_byteindex_m - def self.byte_index(src, str, start=0) + def self.byte_index(src, str, enc, start = 0) start += src.bytesize if start < 0 if start < 0 or start > src.bytesize Primitive.regexp_last_match_set(Primitive.caller_special_variables, nil) if Primitive.object_kind_of?(str, Regexp) @@ -294,9 +296,7 @@ def self.byte_index(src, str, start=0) return start if str == '' - Primitive.encoding_ensure_compatible_str src, str - - Primitive.string_byte_index(src, str, start) + Primitive.string_byte_index(src, str, enc, start) end def self.subpattern(string, pattern, capture) @@ -320,7 +320,7 @@ def self.assign_index(string, index, count, replacement) raise IndexError, "index #{index} out of string" end - unless bi = Primitive.string_byte_index_from_char_index(string, index) + unless bi = Primitive.character_index_to_byte_index(string, index) raise IndexError, "unable to find character at: #{index}" end @@ -335,10 +335,10 @@ def self.assign_index(string, index, count, replacement) if total >= string.size bs = string.bytesize - bi else - bs = Primitive.string_byte_index_from_char_index(string, total) - bi + bs = Primitive.character_index_to_byte_index(string, total) - bi end else - bs = index == string.size ? 0 : Primitive.string_byte_index_from_char_index(string, index + 1) - bi + bs = index == string.size ? 0 : Primitive.character_index_to_byte_index(string, index + 1) - bi end replacement = StringValue replacement @@ -364,7 +364,7 @@ def self.assign_range(string, index, replacement) raise RangeError, "#{index.first} is out of range" if start < 0 or start > string.size - bi = Primitive.string_byte_index_from_char_index(string, start) + bi = Primitive.character_index_to_byte_index(string, start) raise IndexError, "unable to find character at: #{start}" unless bi if stop < start @@ -372,7 +372,7 @@ def self.assign_range(string, index, replacement) elsif stop >= string.size bs = string.bytesize - bi else - bs = Primitive.string_byte_index_from_char_index(string, stop + 1) - bi + bs = Primitive.character_index_to_byte_index(string, stop + 1) - bi end replacement = StringValue replacement @@ -406,8 +406,8 @@ def self.assign_regexp(string, index, count, replacement) replacement = StringValue replacement enc = Primitive.encoding_ensure_compatible_str string, replacement - bi = Primitive.string_byte_index_from_char_index(string, match.begin(count)) - bs = Primitive.string_byte_index_from_char_index(string, match.end(count)) - bi + bi = Primitive.character_index_to_byte_index(string, match.begin(count)) + bs = Primitive.character_index_to_byte_index(string, match.end(count)) - bi Primitive.string_splice(string, replacement, bi, bs, enc) end