From ffe1a0789a338a2b4e5199965319194ec4f6ab90 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Mon, 17 Jul 2023 10:16:00 -0400 Subject: [PATCH] read(io, Char): fix read with too many leading ones (#50552) Fixes #50532. The `read(io, Char)` method didn't correctly handle the case where the lead byte starts with too many leading ones; this fix makes it handle that case correctly, which makes `read(io, Char)` match `collect(s)` in its interpretation of what a character is in all invalid cases. Also fix and test `read(::File, Char)` which has the same bug. --- base/filesystem.jl | 5 +++-- base/io.jl | 4 ++-- test/char.jl | 30 +++++++++++++++++++++++++++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/base/filesystem.jl b/base/filesystem.jl index 63fe4281f6e59..d291dd3b31630 100644 --- a/base/filesystem.jl +++ b/base/filesystem.jl @@ -200,11 +200,12 @@ end function read(f::File, ::Type{Char}) b0 = read(f, UInt8) - l = 8 * (4 - leading_ones(b0)) + l = 0x08 * (0x04 - UInt8(leading_ones(b0))) c = UInt32(b0) << 24 - if l < 24 + if l ≤ 0x10 s = 16 while s ≥ l && !eof(f) + # this works around lack of peek(::File) p = position(f) b = read(f, UInt8) if b & 0xc0 != 0x80 diff --git a/base/io.jl b/base/io.jl index 896702a0b2938..be43a954c621e 100644 --- a/base/io.jl +++ b/base/io.jl @@ -884,9 +884,9 @@ end function read(io::IO, ::Type{Char}) b0 = read(io, UInt8)::UInt8 - l = 8(4-leading_ones(b0)) + l = 0x08 * (0x04 - UInt8(leading_ones(b0))) c = UInt32(b0) << 24 - if l < 24 + if l ≤ 0x10 s = 16 while s ≥ l && !eof(io)::Bool peek(io) & 0xc0 == 0x80 || break diff --git a/test/char.jl b/test/char.jl index 1639c62ec819d..1d3579013ad18 100644 --- a/test/char.jl +++ b/test/char.jl @@ -1,7 +1,6 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license @testset "basic properties" begin - @test typemax(Char) == reinterpret(Char, typemax(UInt32)) @test typemin(Char) == Char(0) @test typemax(Char) == reinterpret(Char, 0xffffffff) @@ -214,6 +213,35 @@ end end end +# issue #50532 +@testset "invalid read(io, Char)" begin + # byte values with different numbers of leading bits + B = UInt8[ + 0x3f, 0x4d, 0x52, 0x63, 0x81, 0x83, 0x89, 0xb6, + 0xc0, 0xc8, 0xd3, 0xe3, 0xea, 0xeb, 0xf0, 0xf2, + 0xf4, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + ] + f = tempname() + for b1 in B, b2 in B, t = 0:3 + bytes = [b1, b2] + append!(bytes, rand(B, t)) + s = String(bytes) + write(f, s) + @test s == read(f, String) + chars = collect(s) + ios = [IOBuffer(s), open(f), Base.Filesystem.open(f, 0)] + for io in ios + chars′ = Char[] + while !eof(io) + push!(chars′, read(io, Char)) + end + @test chars == chars′ + close(io) + end + end + rm(f) +end + @testset "overlong codes" begin function test_overlong(c::Char, n::Integer, rep::String) if isvalid(c)