Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Make Int#chr reject surrogate halves #10451

Merged
merged 5 commits into from
Aug 24, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion spec/std/base64_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ describe "Base64" do

it "works for most characters" do
a = String.build(65536 * 4) do |buf|
65536.times { |i| buf << (i + 1).chr }
65536.times { |i| buf << (i + 1).unsafe_chr }
end
b = Base64.encode(a)
Crystal::Digest::MD5.hexdigest(Base64.decode_string(b)).should eq(Crystal::Digest::MD5.hexdigest(a))
Expand Down
12 changes: 0 additions & 12 deletions spec/std/char_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -286,12 +286,6 @@ describe "Char" do
it "does for unicode" do
'青'.bytesize.should eq(3)
end

it "raises on codepoint bigger than 0x10ffff" do
expect_raises InvalidByteSequenceError do
(0x10ffff + 1).unsafe_chr.bytesize
end
end
end

describe "in_set?" do
Expand Down Expand Up @@ -338,12 +332,6 @@ describe "Char" do
end
end

it "raises on codepoint bigger than 0x10ffff when doing each_byte" do
expect_raises InvalidByteSequenceError do
(0x10ffff + 1).unsafe_chr.each_byte { |b| }
end
end

it "does each_byte" do
'a'.each_byte(&.should eq('a'.ord)).should be_nil
end
Expand Down
10 changes: 9 additions & 1 deletion spec/std/int_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -799,9 +799,17 @@ describe "Int" do
it "#chr" do
65.chr.should eq('A')

expect_raises(ArgumentError, "#{0x10ffff + 1} out of char range") do
expect_raises(ArgumentError, "0x110000 out of char range") do
(0x10ffff + 1).chr
end

expect_raises(ArgumentError, "0xd800 out of char range") do
0xd800.chr
end

expect_raises(ArgumentError, "0xdfff out of char range") do
0xdfff.chr
end
end

it "#unsafe_chr" do
Expand Down
8 changes: 2 additions & 6 deletions src/char.cr
Original file line number Diff line number Diff line change
Expand Up @@ -717,14 +717,12 @@ struct Char
yield (0xe0 | (c >> 12)).to_u8
yield (0x80 | ((c >> 6) & 0x3f)).to_u8
yield (0x80 | (c & 0x3f)).to_u8
elsif c <= MAX_CODEPOINT
else
# 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
yield (0xf0 | (c >> 18)).to_u8
yield (0x80 | ((c >> 12) & 0x3f)).to_u8
yield (0x80 | ((c >> 6) & 0x3f)).to_u8
yield (0x80 | (c & 0x3f)).to_u8
else
raise InvalidByteSequenceError.new("Invalid char value #{dump}")
end
end

Expand All @@ -747,11 +745,9 @@ struct Char
elsif c <= 0xffff
# 1110xxxx 10xxxxxx 10xxxxxx
3
elsif c <= MAX_CODEPOINT
else
# 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
4
else
raise InvalidByteSequenceError.new("Invalid char value #{dump}")
end
end

Expand Down
2 changes: 1 addition & 1 deletion src/http/common.cr
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ module HTTP
String.build do |io|
while quoted_pair_index
io.write(data[0, quoted_pair_index])
io << data[quoted_pair_index + 1].chr
io << data[quoted_pair_index + 1].unsafe_chr

data += quoted_pair_index + 2
quoted_pair_index = data.index('\\'.ord)
Expand Down
7 changes: 4 additions & 3 deletions src/int.cr
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,15 @@ struct Int

# Returns a `Char` that has the unicode codepoint of `self`.
#
# Raises `ArgumentError` if this integer's value doesn't fit a char's range (`0..0x10ffff`).
# Raises `ArgumentError` if this integer's value doesn't fit a char's range
# (`0..0xd7ff` and `0xe000..0x10ffff`).
#
# ```
# 97.chr # => 'a'
# ```
def chr : Char
unless 0 <= self <= Char::MAX_CODEPOINT
raise ArgumentError.new("#{self} out of char range")
unless 0 <= self <= 0xd7ff || 0xe000 <= self <= Char::MAX_CODEPOINT
raise ArgumentError.new("0x#{self.to_s(16)} out of char range")
end
unsafe_chr
end
Expand Down
2 changes: 1 addition & 1 deletion src/primitives.cr
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ end
struct {{int.id}}
# Returns a `Char` that has the unicode codepoint of `self`,
# without checking if this integer is in the range valid for
# chars (`0..0x10ffff`).
# chars (`0..0xd7ff` and `0xe000..0x10ffff`).
#
# You should never use this method unless `chr` turns out to
# be a bottleneck.
Expand Down
12 changes: 6 additions & 6 deletions src/string.cr
Original file line number Diff line number Diff line change
Expand Up @@ -712,18 +712,18 @@ class String
unless v.finite?
startptr = to_unsafe
if whitespace
while startptr.value.chr.ascii_whitespace?
while startptr.value.unsafe_chr.ascii_whitespace?
startptr += 1
end
end
if startptr.value.chr.in?('+', '-')
if startptr.value.unsafe_chr.in?('+', '-')
startptr += 1
end

if v.nan?
return unless startptr.value.chr.in?('n', 'N')
return unless startptr.value.unsafe_chr.in?('n', 'N')
else
return unless startptr.value.chr.in?('i', 'I')
return unless startptr.value.unsafe_chr.in?('i', 'I')
end
end

Expand All @@ -734,7 +734,7 @@ class String

if strict
if whitespace
while endptr < string_end && endptr.value.chr.ascii_whitespace?
while endptr < string_end && endptr.value.unsafe_chr.ascii_whitespace?
endptr += 1
end
end
Expand All @@ -743,7 +743,7 @@ class String
else
ptr = to_unsafe
if whitespace
while ptr < string_end && ptr.value.chr.ascii_whitespace?
while ptr < string_end && ptr.value.unsafe_chr.ascii_whitespace?
ptr += 1
end
end
Expand Down