Skip to content

Commit

Permalink
[GR-18163] Use the compatible encoding for String#{sub,gsub,index,rin…
Browse files Browse the repository at this point in the history
…dex} (#2749)

PullRequest: truffleruby/3504
(cherry picked from commit bf6272d)
  • Loading branch information
eregon committed Oct 6, 2022
1 parent 84b886d commit c83ae6d
Show file tree
Hide file tree
Showing 9 changed files with 91 additions and 52 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Bug fixes:
* Fix `String#split` missing a value in its return array when called with a pattern of `" "` and a _limit_ value > 0 on a string with trailing whitespace where the limit hasn't been met (@nirvdrum).
* Fix `Kernel#sleep` and `Mutex#sleep` for durations smaller than 1 millisecond (#2716, @eregon).
* Fix `IO#{wait,wait_readable,wait_writable}` with a timeout > INT_MAX seconds (@eregon).
* Use the compatible encoding for `String#{sub,gsub,index,rindex}` (#2749, @eregon).

Compatibility:

Expand Down
14 changes: 12 additions & 2 deletions spec/ruby/core/string/gsub_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,6 @@ def replacement.to_str() "hello_replacement" end
end
end

# Note: $~ cannot be tested because mspec messes with it

it "sets $~ to MatchData of last match and nil when there's none" do
'hello.'.gsub('hello', 'x')
$~[0].should == 'hello'
Expand All @@ -225,6 +223,18 @@ def replacement.to_str() "hello_replacement" end
'hello.'.gsub(/not/, 'x')
$~.should == nil
end

it "handles a pattern in a superset encoding" do
result = 'abc'.force_encoding(Encoding::US_ASCII).gsub('é', 'è')
result.should == 'abc'
result.encoding.should == Encoding::US_ASCII
end

it "handles a pattern in a subset encoding" do
result = 'été'.gsub('t'.force_encoding(Encoding::US_ASCII), 'u')
result.should == 'éué'
result.encoding.should == Encoding::UTF_8
end
end

describe "String#gsub with pattern and Hash" do
Expand Down
8 changes: 8 additions & 0 deletions spec/ruby/core/string/index_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,14 @@
"あれ".index char
end.should raise_error(Encoding::CompatibilityError)
end

it "handles a substring in a superset encoding" do
'abc'.force_encoding(Encoding::US_ASCII).index('é').should == nil
end

it "handles a substring in a subset encoding" do
'été'.index('t'.force_encoding(Encoding::US_ASCII)).should == 1
end
end

describe "String#index with Regexp" do
Expand Down
8 changes: 8 additions & 0 deletions spec/ruby/core/string/rindex_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,14 @@ def obj.method_missing(*args) 5 end
it "raises a TypeError when given offset is nil" do
-> { "str".rindex("st", nil) }.should raise_error(TypeError)
end

it "handles a substring in a superset encoding" do
'abc'.force_encoding(Encoding::US_ASCII).rindex('é').should == nil
end

it "handles a substring in a subset encoding" do
'été'.rindex('t'.force_encoding(Encoding::US_ASCII)).should == 1
end
end

describe "String#rindex with Regexp" do
Expand Down
11 changes: 11 additions & 0 deletions spec/ruby/core/string/sub_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,17 @@
"ababa".sub(/(b)/, '\\\\\1').should == "a\\baba"
end

it "handles a pattern in a superset encoding" do
result = 'abc'.force_encoding(Encoding::US_ASCII).sub('é', 'è')
result.should == 'abc'
result.encoding.should == Encoding::US_ASCII
end

it "handles a pattern in a subset encoding" do
result = 'été'.sub('t'.force_encoding(Encoding::US_ASCII), 'u')
result.should == 'éué'
result.encoding.should == Encoding::UTF_8
end
end

describe "String#sub with pattern and block" do
Expand Down
63 changes: 32 additions & 31 deletions src/main/java/org/truffleruby/core/string/StringNodes.java
Original file line number Diff line number Diff line change
Expand Up @@ -3774,7 +3774,7 @@ protected Object findStringByteIndex(Object rubyString, Object rubyPattern, int

}

@Primitive(name = "string_byte_character_index", lowerFixnum = 1)
@Primitive(name = "byte_index_to_character_index", lowerFixnum = 1)
public abstract static class StringByteCharacterIndexNode extends PrimitiveArrayArgumentsNode {
@Specialization
protected int byteIndexToCodePointIndex(Object string, int byteIndex,
Expand All @@ -3786,16 +3786,29 @@ protected int byteIndexToCodePointIndex(Object string, int byteIndex,
}
}

// Named 'string_byte_index' in Rubinius.
@Primitive(name = "character_index_to_byte_index", lowerFixnum = 1)
public abstract static class StringByteIndexFromCharIndexNode extends PrimitiveArrayArgumentsNode {
@Specialization
protected Object byteIndexFromCharIndex(Object string, int characterIndex,
@Cached TruffleString.CodePointIndexToByteIndexNode codePointIndexToByteIndexNode,
@Cached RubyStringLibrary libString) {
return codePointIndexToByteIndexNode.execute(libString.getTString(string), 0, characterIndex,
libString.getTEncoding(string));
}
}

/** Search pattern in string starting after offset characters, and return a character index or nil */
@Primitive(name = "string_character_index", lowerFixnum = 2)
@Primitive(name = "string_character_index", lowerFixnum = 3)
public abstract static class StringCharacterIndexNode extends PrimitiveArrayArgumentsNode {

protected final RubyStringLibrary libString = RubyStringLibrary.create();
protected final RubyStringLibrary libPattern = RubyStringLibrary.create();
@Child SingleByteOptimizableNode singleByteOptimizableNode = SingleByteOptimizableNode.create();

@Specialization(guards = "singleByteOptimizableNode.execute(string, stringEncoding)")
protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, int codePointOffset,
protected Object singleByteOptimizable(
Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int codePointOffset,
@Bind("libString.getTString(rubyString)") AbstractTruffleString string,
@Bind("libString.getEncoding(rubyString)") RubyEncoding stringEncoding,
@Bind("libPattern.getTString(rubyPattern)") AbstractTruffleString pattern,
Expand All @@ -3808,12 +3821,11 @@ protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, in
// When single-byte optimizable, the byte length and the codepoint length are the same.
int stringByteLength = string.byteLength(stringEncoding.tencoding);

assert codePointOffset + pattern.byteLength(
patternEncoding.tencoding) <= stringByteLength : "already checked in the caller, String#index";
assert codePointOffset + pattern.byteLength(patternEncoding.tencoding) <= stringByteLength
: "already checked in the caller, String#index";

int found = byteIndexOfStringNode.execute(string, pattern, codePointOffset,
stringByteLength,
stringEncoding.tencoding);
int found = byteIndexOfStringNode.execute(string, pattern, codePointOffset, stringByteLength,
compatibleEncoding.tencoding);

if (foundProfile.profile(found >= 0)) {
return found;
Expand All @@ -3823,7 +3835,8 @@ protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, in
}

@Specialization(guards = "!singleByteOptimizableNode.execute(string, stringEncoding)")
protected Object multiByte(Object rubyString, Object rubyPattern, int codePointOffset,
protected Object multiByte(
Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int codePointOffset,
@Bind("libString.getTString(rubyString)") AbstractTruffleString string,
@Bind("libString.getEncoding(rubyString)") RubyEncoding stringEncoding,
@Bind("libPattern.getTString(rubyPattern)") AbstractTruffleString pattern,
Expand All @@ -3838,7 +3851,7 @@ protected Object multiByte(Object rubyString, Object rubyPattern, int codePointO

int stringCodePointLength = codePointLengthNode.execute(string, stringEncoding.tencoding);
int found = indexOfStringNode.execute(string, pattern, codePointOffset, stringCodePointLength,
stringEncoding.tencoding);
compatibleEncoding.tencoding);

if (foundProfile.profile(found >= 0)) {
return found;
Expand All @@ -3849,11 +3862,12 @@ protected Object multiByte(Object rubyString, Object rubyPattern, int codePointO
}

/** Search pattern in string starting after offset bytes, and return a byte index or nil */
@Primitive(name = "string_byte_index", lowerFixnum = 2)
@Primitive(name = "string_byte_index", lowerFixnum = 3)
public abstract static class StringByteIndexNode extends PrimitiveArrayArgumentsNode {

@Specialization
protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byteOffset,
protected Object stringByteIndex(
Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int byteOffset,
@Cached RubyStringLibrary libString,
@Cached RubyStringLibrary libPattern,
@Cached TruffleString.ByteIndexOfStringNode byteIndexOfStringNode,
Expand All @@ -3862,18 +3876,17 @@ protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byte
assert byteOffset >= 0;

var string = libString.getTString(rubyString);
var stringEncoding = libString.getEncoding(rubyString).tencoding;
int stringByteLength = string.byteLength(stringEncoding);
int stringByteLength = libString.byteLength(rubyString);

var pattern = libPattern.getTString(rubyPattern);
var patternEncoding = libPattern.getEncoding(rubyPattern).tencoding;
int patternByteLength = pattern.byteLength(patternEncoding);
int patternByteLength = libPattern.byteLength(rubyPattern);

if (indexOutOfBoundsProfile.profile(byteOffset + patternByteLength > stringByteLength)) {
return nil;
}

int found = byteIndexOfStringNode.execute(string, pattern, byteOffset, stringByteLength, stringEncoding);
int found = byteIndexOfStringNode.execute(string, pattern, byteOffset, stringByteLength,
compatibleEncoding.tencoding);
if (foundProfile.profile(found >= 0)) {
return found;
}
Expand All @@ -3882,18 +3895,6 @@ protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byte
}
}

// Named 'string_byte_index' in Rubinius.
@Primitive(name = "string_byte_index_from_char_index", lowerFixnum = 1)
public abstract static class StringByteIndexFromCharIndexNode extends PrimitiveArrayArgumentsNode {
@Specialization
protected Object byteIndexFromCharIndex(Object string, int characterIndex,
@Cached TruffleString.CodePointIndexToByteIndexNode codePointIndexToByteIndexNode,
@Cached RubyStringLibrary libString) {
return codePointIndexToByteIndexNode.execute(libString.getTString(string), 0, characterIndex,
libString.getTEncoding(string));
}
}

// Port of Rubinius's String::previous_byte_index.
//
// This method takes a byte index, finds the corresponding character the byte index belongs to, and then returns
Expand Down Expand Up @@ -3984,7 +3985,7 @@ protected Object stringRindex(Object rubyString, Object rubyPattern, int byteOff
assert byteOffset >= 0;

// Throw an exception if the encodings are not compatible.
checkEncodingNode.executeCheckEncoding(rubyString, rubyPattern);
var compatibleEncoding = checkEncodingNode.executeCheckEncoding(rubyString, rubyPattern);

var string = libString.getTString(rubyString);
var stringEncoding = libString.getEncoding(rubyString).tencoding;
Expand All @@ -4007,7 +4008,7 @@ protected Object stringRindex(Object rubyString, Object rubyPattern, int byteOff
}

int result = lastByteIndexOfStringNode.execute(string, pattern, normalizedStart + patternByteLength, 0,
stringEncoding);
compatibleEncoding.tencoding);

if (result < 0) {
noMatchProfile.enter();
Expand Down
10 changes: 5 additions & 5 deletions src/main/ruby/truffleruby/core/string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1025,7 +1025,7 @@ def index(str, start=undefined)
if Primitive.object_kind_of?(str, Regexp)
Primitive.encoding_ensure_compatible self, str

start = Primitive.string_byte_index_from_char_index(self, start)
start = Primitive.character_index_to_byte_index(self, start)
if match = Truffle::RegexpOperations.match_from(str, self, start)
Primitive.regexp_last_match_set(Primitive.caller_special_variables, match)
return match.begin(0)
Expand All @@ -1038,11 +1038,11 @@ def index(str, start=undefined)
str = StringValue(str)
return start if str == ''

Primitive.encoding_ensure_compatible_str self, str
enc = Primitive.encoding_ensure_compatible_str self, str

return if start + str.size > size

Primitive.string_character_index(self, str, start)
Primitive.string_character_index(self, str, enc, start)
end

def initialize(other = undefined, capacity: nil, encoding: nil)
Expand All @@ -1064,7 +1064,7 @@ def rindex(sub, finish=undefined)
finish = size if finish >= size
end

byte_finish = Primitive.string_byte_index_from_char_index(self, finish)
byte_finish = Primitive.character_index_to_byte_index(self, finish)

if Primitive.object_kind_of?(sub, Regexp)
Primitive.encoding_ensure_compatible self, sub
Expand All @@ -1085,7 +1085,7 @@ def rindex(sub, finish=undefined)

Primitive.encoding_ensure_compatible_str self, needle
if byte_index = Primitive.find_string_reverse(self, needle, byte_finish)
return Primitive.string_byte_character_index(self, byte_index)
return Primitive.byte_index_to_character_index(self, byte_index)
end
end

Expand Down
4 changes: 2 additions & 2 deletions src/main/ruby/truffleruby/core/truffle/regexp_operations.rb
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def self.match(re, str, pos=0)

pos = pos < 0 ? pos + str.size : pos
return nil if pos < 0 or pos > str.size
pos = Primitive.string_byte_index_from_char_index(str, pos)
pos = Primitive.character_index_to_byte_index(str, pos)

search_region(re, str, pos, str.bytesize, true, true)
end
Expand All @@ -50,7 +50,7 @@ def self.match?(re, str, pos=0)

pos = pos < 0 ? pos + str.size : pos
return false if pos < 0 or pos > str.size
pos = Primitive.string_byte_index_from_char_index(str, pos)
pos = Primitive.character_index_to_byte_index(str, pos)

search_region(re, str, pos, str.bytesize, true, false)
end
Expand Down
24 changes: 12 additions & 12 deletions src/main/ruby/truffleruby/core/truffle/string_operations.rb
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,9 @@ def self.gsub_regexp_matches(global, orig, pattern)
def self.gsub_string_matches(global, orig, pattern)
res = []
offset = 0
while index = byte_index(orig, pattern, offset)
enc = Primitive.encoding_ensure_compatible_str orig, pattern

while index = byte_index(orig, pattern, enc, offset)
match = Primitive.matchdata_create_single_group(pattern, orig.dup, index, index + pattern.bytesize)
res << match
break unless global
Expand Down Expand Up @@ -285,7 +287,7 @@ def self.validate_case_mapping_options(options, downcasing)
end

# MRI: rb_str_byteindex_m
def self.byte_index(src, str, start=0)
def self.byte_index(src, str, enc, start = 0)
start += src.bytesize if start < 0
if start < 0 or start > src.bytesize
Primitive.regexp_last_match_set(Primitive.caller_special_variables, nil) if Primitive.object_kind_of?(str, Regexp)
Expand All @@ -294,9 +296,7 @@ def self.byte_index(src, str, start=0)

return start if str == ''

Primitive.encoding_ensure_compatible_str src, str

Primitive.string_byte_index(src, str, start)
Primitive.string_byte_index(src, str, enc, start)
end

def self.subpattern(string, pattern, capture)
Expand All @@ -320,7 +320,7 @@ def self.assign_index(string, index, count, replacement)
raise IndexError, "index #{index} out of string"
end

unless bi = Primitive.string_byte_index_from_char_index(string, index)
unless bi = Primitive.character_index_to_byte_index(string, index)
raise IndexError, "unable to find character at: #{index}"
end

Expand All @@ -335,10 +335,10 @@ def self.assign_index(string, index, count, replacement)
if total >= string.size
bs = string.bytesize - bi
else
bs = Primitive.string_byte_index_from_char_index(string, total) - bi
bs = Primitive.character_index_to_byte_index(string, total) - bi
end
else
bs = index == string.size ? 0 : Primitive.string_byte_index_from_char_index(string, index + 1) - bi
bs = index == string.size ? 0 : Primitive.character_index_to_byte_index(string, index + 1) - bi
end

replacement = StringValue replacement
Expand All @@ -364,15 +364,15 @@ def self.assign_range(string, index, replacement)

raise RangeError, "#{index.first} is out of range" if start < 0 or start > string.size

bi = Primitive.string_byte_index_from_char_index(string, start)
bi = Primitive.character_index_to_byte_index(string, start)
raise IndexError, "unable to find character at: #{start}" unless bi

if stop < start
bs = 0
elsif stop >= string.size
bs = string.bytesize - bi
else
bs = Primitive.string_byte_index_from_char_index(string, stop + 1) - bi
bs = Primitive.character_index_to_byte_index(string, stop + 1) - bi
end

replacement = StringValue replacement
Expand Down Expand Up @@ -406,8 +406,8 @@ def self.assign_regexp(string, index, count, replacement)
replacement = StringValue replacement
enc = Primitive.encoding_ensure_compatible_str string, replacement

bi = Primitive.string_byte_index_from_char_index(string, match.begin(count))
bs = Primitive.string_byte_index_from_char_index(string, match.end(count)) - bi
bi = Primitive.character_index_to_byte_index(string, match.begin(count))
bs = Primitive.character_index_to_byte_index(string, match.end(count)) - bi

Primitive.string_splice(string, replacement, bi, bs, enc)
end
Expand Down

0 comments on commit c83ae6d

Please sign in to comment.