Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make IO#read_char's default behaviour UTF-8-strict #10446

Merged
merged 4 commits into from
Aug 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions spec/std/io/io_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -281,15 +281,29 @@ describe IO do
io.read_char.should eq('界')
io.read_char.should be_nil

io.write Bytes[0xf8, 0xff, 0xff, 0xff]
expect_raises(InvalidByteSequenceError) do
io.read_char
end

io.write_byte 0x81_u8
expect_raises(InvalidByteSequenceError) do
io.read_char
end
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc4, 0x70]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc4, 0x70, 0x00, 0x00]).read_char }

expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xf8]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xf8, 0x00, 0x00, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0x81]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0x81, 0x00, 0x00, 0x00]).read_char }

expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xed, 0xa0, 0x80]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xed, 0xa0, 0x80, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xed, 0xbf, 0xbf]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xed, 0xbf, 0xbf, 0x00]).read_char }

expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc0, 0x80]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc0, 0x80, 0x00, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc1, 0xbf]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc1, 0xbf, 0x00, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xe0, 0x80, 0x80]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xe0, 0x80, 0x80, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xe0, 0x9f, 0xbf]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xe0, 0x9f, 0xbf, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xf0, 0x80, 0x80, 0x80]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xf0, 0x8f, 0xbf, 0xbf]).read_char }
end

it "reads byte" do
Expand Down
2 changes: 2 additions & 0 deletions src/char/reader.cr
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ struct Char
end
end

# :nodoc:
# See also: `IO#read_char_with_bytesize`.
private def decode_char_at(pos, & : UInt32, Int32, UInt8? ->)
first = byte_at(pos)
if first < 0x80
Expand Down
99 changes: 48 additions & 51 deletions src/io.cr
Original file line number Diff line number Diff line change
Expand Up @@ -296,80 +296,77 @@ abstract class IO
# io.read_char # => nil
# ```
def read_char : Char?
info = read_char_with_bytesize
peek = self.peek unless decoder
info = read_char_with_bytesize(peek)
info ? info[0] : nil
end

private def read_char_with_bytesize
# For UTF-8 encoding, try to see if we can peek 4 bytes.
# If so, this will be faster than reading byte per byte.
if !decoder && (peek = self.peek)
if peek.empty?
return nil
else
return read_char_with_bytesize_peek(peek)
end
else
read_char_with_bytesize_slow
end
end
# :nodoc:
# See also: `Char::Reader#decode_char_at`.
private def read_char_with_bytesize(peek = nil)
first = peek_or_read_utf8(peek, 0)
return nil unless first
first = first.to_u32

private def read_char_with_bytesize_peek(peek)
first = peek[0].to_u32
skip(1)
if first < 0x80
return first.unsafe_chr, 1
end

second = peek_or_read_masked(peek, 1)
if first < 0xe0
return ((first & 0x1f) << 6 | second).unsafe_chr, 2
if first < 0xc2
raise InvalidByteSequenceError.new("Unexpected byte 0x#{first.to_s(16)} in UTF-8 byte sequence")
end

third = peek_or_read_masked(peek, 2)
if first < 0xf0
return ((first & 0x0f) << 12 | (second << 6) | third).unsafe_chr, 3
end
second = peek_or_read_utf8_masked(peek, 1)

fourth = peek_or_read_masked(peek, 3)
if first < 0xf8
return ((first & 0x07) << 18 | (second << 12) | (third << 6) | fourth).unsafe_chr, 4
if first < 0xe0
return ((first << 6) &+ (second &- 0x3080)).unsafe_chr, 2
end

raise InvalidByteSequenceError.new("Unexpected byte 0x#{first.to_s(16)} in UTF-8 byte sequence")
end
third = peek_or_read_utf8_masked(peek, 2)

private def read_char_with_bytesize_slow
first = read_utf8_byte
return nil unless first
if first < 0xf0
if first == 0xe0 && second < 0xa0
raise InvalidByteSequenceError.new("Overlong UTF-8 encoding")
end

first = first.to_u32
return first.unsafe_chr, 1 if first < 0x80
if first == 0xed && second >= 0xa0
raise InvalidByteSequenceError.new("Invalid UTF-8 codepoint")
end

second = read_utf8_masked_byte
return ((first & 0x1f) << 6 | second).unsafe_chr, 2 if first < 0xe0
return ((first << 12) &+ (second << 6) &+ (third &- 0xE2080)).unsafe_chr, 3
end

third = read_utf8_masked_byte
return ((first & 0x0f) << 12 | (second << 6) | third).unsafe_chr, 3 if first < 0xf0
if first < 0xf5
if first == 0xf0 && second < 0x90
raise InvalidByteSequenceError.new("Overlong UTF-8 encoding")
end

fourth = read_utf8_masked_byte
return ((first & 0x07) << 18 | (second << 12) | (third << 6) | fourth).unsafe_chr, 4 if first < 0xf8
if first == 0xf4 && second >= 0x90
raise InvalidByteSequenceError.new("Invalid UTF-8 codepoint")
end

raise InvalidByteSequenceError.new("Unexpected byte 0x#{first.to_s(16)} in UTF-8 byte sequence")
end
fourth = peek_or_read_utf8_masked(peek, 3)
return ((first << 18) &+ (second << 12) &+ (third << 6) &+ (fourth &- 0x3C82080)).unsafe_chr, 4
end

private def read_utf8_masked_byte
byte = read_utf8_byte || raise InvalidByteSequenceError.new("Incomplete UTF-8 byte sequence")
(byte & 0x3f).to_u32
raise InvalidByteSequenceError.new("Unexpected byte 0x#{first.to_s(16)} in UTF-8 byte sequence")
end

private def peek_or_read_masked(peek, index)
if byte = peek[index]?
private def peek_or_read_utf8(peek, index)
if peek && (byte = peek[index]?)
skip(1)
(byte & 0x3f).to_u32
byte
else
read_utf8_masked_byte
read_utf8_byte
end
end

# Fetches the byte at *index* of a multi-byte UTF-8 sequence via
# `peek_or_read_utf8` and checks that it is a continuation byte, i.e. that
# its two most significant bits are `10`.
#
# Raises `InvalidByteSequenceError` when no byte is available or when the
# byte does not have the continuation bit pattern.
#
# Returns the byte converted with `to_u32`. No `& 0x3f` mask is applied in
# this method, so the value still carries its `10` tag bits.
private def peek_or_read_utf8_masked(peek, index)
# A missing byte at this position means the sequence ended too early.
byte = peek_or_read_utf8(peek, index) || raise InvalidByteSequenceError.new("Incomplete UTF-8 byte sequence")
# Reject any byte whose top two bits are not `10`.
if (byte & 0xc0) != 0x80
raise InvalidByteSequenceError.new("Unexpected continuation byte 0x#{byte.to_s(16)} in UTF-8 byte sequence")
end
byte.to_u32
end

# Reads a single decoded UTF-8 byte from this `IO`.
Expand Down Expand Up @@ -732,7 +729,7 @@ abstract class IO
buffer = String::Builder.new
total = 0
while true
info = read_char_with_bytesize_slow
info = read_char_with_bytesize
unless info
return buffer.empty? ? nil : buffer.to_s
end
Expand All @@ -741,7 +738,7 @@ abstract class IO

# Consider the case of \r\n when the delimiter is \n and chomp = true
if chomp_rn && char == '\r'
info2 = read_char_with_bytesize_slow
info2 = read_char_with_bytesize
unless info2
buffer << char
break
Expand Down