Skip to content

Commit

Permalink
Fix String#index and #rindex for Char::REPLACEMENT (#14937)
Browse files Browse the repository at this point in the history
If the string consists only of ASCII characters and invalid UTF-8 byte sequences, all the latter should correspond to `Char::REPLACEMENT`, and so `#index` and `#rindex` should detect them, but this was broken since #14461.
  • Loading branch information
HertzDevil committed Aug 25, 2024
1 parent cc0dfc1 commit 0ad3e91
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 8 deletions.
6 changes: 6 additions & 0 deletions spec/std/string_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -957,13 +957,16 @@ describe "String" do
it { "日本語".index('本').should eq(1) }
it { "bar".index('あ').should be_nil }
it { "あいう_えお".index('_').should eq(3) }
it { "xyz\xFFxyz".index('\u{FFFD}').should eq(3) }

describe "with offset" do
it { "foobarbaz".index('a', 5).should eq(7) }
it { "foobarbaz".index('a', -4).should eq(7) }
it { "foo".index('g', 1).should be_nil }
it { "foo".index('g', -20).should be_nil }
it { "日本語日本語".index('本', 2).should eq(4) }
it { "xyz\xFFxyz".index('\u{FFFD}', 2).should eq(3) }
it { "xyz\xFFxyz".index('\u{FFFD}', 4).should be_nil }

# Check offset type
it { "foobarbaz".index('a', 5_i64).should eq(7) }
Expand Down Expand Up @@ -1106,6 +1109,7 @@ describe "String" do
it { "foobar".rindex('g').should be_nil }
it { "日本語日本語".rindex('本').should eq(4) }
it { "あいう_えお".rindex('_').should eq(3) }
it { "xyz\xFFxyz".rindex('\u{FFFD}').should eq(3) }

describe "with offset" do
it { "bbbb".rindex('b', 2).should eq(2) }
Expand All @@ -1118,6 +1122,8 @@ describe "String" do
it { "faobar".rindex('a', 3).should eq(1) }
it { "faobarbaz".rindex('a', -3).should eq(4) }
it { "日本語日本語".rindex('本', 3).should eq(1) }
it { "xyz\xFFxyz".rindex('\u{FFFD}', 4).should eq(3) }
it { "xyz\xFFxyz".rindex('\u{FFFD}', 2).should be_nil }

# Check offset type
it { "bbbb".rindex('b', 2_i64).should eq(2) }
Expand Down
36 changes: 28 additions & 8 deletions src/string.cr
Original file line number Diff line number Diff line change
Expand Up @@ -3349,11 +3349,21 @@ class String
def index(search : Char, offset = 0) : Int32?
# If it's ASCII we can delegate to slice
if single_byte_optimizable?
# With `single_byte_optimizable?` there are only ASCII characters and invalid UTF-8 byte
# sequences and we can immediately reject any non-ASCII codepoint.
return unless search.ascii?
# With `single_byte_optimizable?` there are only ASCII characters and
# invalid UTF-8 byte sequences, and we can reject anything that is neither
# ASCII nor the replacement character.
case search
when .ascii?
return to_slice.fast_index(search.ord.to_u8!, offset)
when Char::REPLACEMENT
offset.upto(bytesize - 1) do |i|
if to_unsafe[i] >= 0x80
return i.to_i
end
end
end

return to_slice.fast_index(search.ord.to_u8, offset)
return nil
end

offset += size if offset < 0
Expand Down Expand Up @@ -3469,11 +3479,21 @@ class String
def rindex(search : Char, offset = size - 1)
# If it's ASCII we can delegate to slice
if single_byte_optimizable?
# With `single_byte_optimizable?` there are only ASCII characters and invalid UTF-8 byte
# sequences and we can immediately reject any non-ASCII codepoint.
return unless search.ascii?
# With `single_byte_optimizable?` there are only ASCII characters and
# invalid UTF-8 byte sequences, and we can reject anything that is neither
# ASCII nor the replacement character.
case search
when .ascii?
return to_slice.rindex(search.ord.to_u8!, offset)
when Char::REPLACEMENT
offset.downto(0) do |i|
if to_unsafe[i] >= 0x80
return i.to_i
end
end
end

return to_slice.rindex(search.ord.to_u8, offset)
return nil
end

offset += size if offset < 0
Expand Down

0 comments on commit 0ad3e91

Please sign in to comment.