From 6f3b495fd8437def303dfed775e46f757afc54c3 Mon Sep 17 00:00:00 2001 From: Quinton Miller Date: Sat, 24 Aug 2024 00:29:33 +0800 Subject: [PATCH] Fix `String#index` and `#rindex` for replacement character --- spec/std/string_spec.cr | 6 ++++++ src/string.cr | 36 ++++++++++++++++++++++++++++-------- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr index 6bb4bd2c0c62..5b70deda13c3 100644 --- a/spec/std/string_spec.cr +++ b/spec/std/string_spec.cr @@ -957,6 +957,7 @@ describe "String" do it { "日本語".index('本').should eq(1) } it { "bar".index('あ').should be_nil } it { "あいう_えお".index('_').should eq(3) } + it { "xyz\xFFxyz".index('\u{FFFD}').should eq(3) } describe "with offset" do it { "foobarbaz".index('a', 5).should eq(7) } @@ -964,6 +965,8 @@ describe "String" do it { "foo".index('g', 1).should be_nil } it { "foo".index('g', -20).should be_nil } it { "日本語日本語".index('本', 2).should eq(4) } + it { "xyz\xFFxyz".index('\u{FFFD}', 2).should eq(3) } + it { "xyz\xFFxyz".index('\u{FFFD}', 4).should be_nil } # Check offset type it { "foobarbaz".index('a', 5_i64).should eq(7) } @@ -1106,6 +1109,7 @@ describe "String" do it { "foobar".rindex('g').should be_nil } it { "日本語日本語".rindex('本').should eq(4) } it { "あいう_えお".rindex('_').should eq(3) } + it { "xyz\xFFxyz".rindex('\u{FFFD}').should eq(3) } describe "with offset" do it { "bbbb".rindex('b', 2).should eq(2) } @@ -1118,6 +1122,8 @@ describe "String" do it { "faobar".rindex('a', 3).should eq(1) } it { "faobarbaz".rindex('a', -3).should eq(4) } it { "日本語日本語".rindex('本', 3).should eq(1) } + it { "xyz\xFFxyz".rindex('\u{FFFD}', 4).should eq(3) } + it { "xyz\xFFxyz".rindex('\u{FFFD}', 2).should be_nil } # Check offset type it { "bbbb".rindex('b', 2_i64).should eq(2) } diff --git a/src/string.cr b/src/string.cr index cf96401253b8..35c33b903939 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3349,11 +3349,21 @@ class String def index(search : Char, offset = 0) : Int32? # If it's ASCII we can delegate to slice if single_byte_optimizable? - # With `single_byte_optimizable?` there are only ASCII characters and invalid UTF-8 byte - # sequences and we can immediately reject any non-ASCII codepoint. - return unless search.ascii? + # With `single_byte_optimizable?` there are only ASCII characters and + # invalid UTF-8 byte sequences, and we can reject anything that is neither + # ASCII nor the replacement character. + case search + when .ascii? + return to_slice.fast_index(search.ord.to_u8!, offset) + when Char::REPLACEMENT + offset.upto(bytesize - 1) do |i| + if to_unsafe[i] >= 0x80 + return i.to_i + end + end + end - return to_slice.fast_index(search.ord.to_u8, offset) + return nil end offset += size if offset < 0 @@ -3469,11 +3479,21 @@ class String def rindex(search : Char, offset = size - 1) # If it's ASCII we can delegate to slice if single_byte_optimizable? - # With `single_byte_optimizable?` there are only ASCII characters and invalid UTF-8 byte - # sequences and we can immediately reject any non-ASCII codepoint. - return unless search.ascii? + # With `single_byte_optimizable?` there are only ASCII characters and + # invalid UTF-8 byte sequences, and we can reject anything that is neither + # ASCII nor the replacement character. + case search + when .ascii? + return to_slice.rindex(search.ord.to_u8!, offset) + when Char::REPLACEMENT + offset.downto(0) do |i| + if to_unsafe[i] >= 0x80 + return i.to_i + end + end + end - return to_slice.rindex(search.ord.to_u8, offset) + return nil end offset += size if offset < 0