Skip to content

Commit

Permalink
Merge pull request #11624 from ScottPJones/spj/fixutf8
Browse files Browse the repository at this point in the history
Fix #10959, fix #11463 bugs with UTF-8 conversions
  • Loading branch information
StefanKarpinski committed Jul 28, 2015
2 parents 88bb2e9 + 91305f7 commit 416a23e
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 43 deletions.
92 changes: 77 additions & 15 deletions base/unicode/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## from base/boot.jl:
#
# immutable UTF8String <: AbstractString
# data::Array{UInt8,1}
# data::Vector{UInt8}
# end
#

Expand All @@ -26,6 +26,8 @@ const utf8_trailing = [
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
]

# Retained because although undocumented and unexported, used in a package (MutableStrings)
# should be deprecated
is_utf8_start(byte::UInt8) = ((byte&0xc0)!=0x80)

## required core functionality ##
Expand All @@ -34,19 +36,17 @@ function endof(s::UTF8String)
d = s.data
i = length(d)
i == 0 && return i
while !is_utf8_start(d[i])
while is_valid_continuation(d[i])
i -= 1
end
i
end

is_utf8_continuation(byte::UInt8) = ((byte&0xc0) == 0x80)

function length(s::UTF8String)
d = s.data
cnum = 0
for i = 1:length(d)
@inbounds cnum += !is_utf8_continuation(d[i])
@inbounds cnum += !is_valid_continuation(d[i])
end
cnum
end
Expand All @@ -65,7 +65,7 @@ function next(s::UTF8String, i::Int)

d = s.data
b = d[i]
if !is_utf8_start(b)
if is_valid_continuation(b)
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
end
trailing = utf8_trailing[b+1]
Expand Down Expand Up @@ -93,7 +93,7 @@ end
function reverseind(s::UTF8String, i::Integer)
j = lastidx(s) + 1 - i
d = s.data
while !is_utf8_start(d[j])
while is_valid_continuation(d[j])
j -= 1
end
return j
Expand All @@ -106,7 +106,7 @@ sizeof(s::UTF8String) = sizeof(s.data)
lastidx(s::UTF8String) = length(s.data)

isvalid(s::UTF8String, i::Integer) =
(1 <= i <= endof(s.data)) && is_utf8_start(s.data[i])
(1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])

const empty_utf8 = UTF8String(UInt8[])

Expand All @@ -133,7 +133,7 @@ function search(s::UTF8String, c::Char, i::Integer)
throw(BoundsError(s, i))
end
d = s.data
if !is_utf8_start(d[i])
if is_valid_continuation(d[i])
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
end
c < Char(0x80) && return search(d, c%UInt8, i)
Expand Down Expand Up @@ -216,20 +216,82 @@ convert(::Type{UTF8String}, s::UTF8String) = s
convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
convert(::Type{SubString{UTF8String}}, s::SubString{ASCIIString}) =
SubString(utf8(s.string), s.offset+1, s.endof+s.offset)
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8))
function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)

"""
Converts a UTF-8 encoded vector of `UInt8` to a `UTF8String`
### Returns:
* `UTF8String`
### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF8String}, dat::Vector{UInt8})
# handle zero length string quickly
isempty(dat) && return empty_utf8
# get number of bytes to allocate
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat)
if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
len = sizeof(dat)
@inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
end
# Copy, but eliminate over-long encodings and surrogate pairs
len += num2byte + num3byte*2 + num4byte*3
buf = Vector{UInt8}(len)
out = 0
pos = 0
@inbounds while out < len
ch::UInt32 = dat[pos += 1]
# Handle ASCII characters
if ch <= 0x7f
buf[out += 1] = ch
# Handle overlong < 0x100
elseif ch < 0xc2
buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
# Handle 0x100-0x7ff
elseif ch < 0xe0
buf[out += 1] = ch
buf[out += 1] = dat[pos += 1]
elseif ch != 0xed
buf[out += 1] = ch
buf[out += 1] = dat[pos += 1]
buf[out += 1] = dat[pos += 1]
# Copy 4-byte encoded value
ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
# Handle surrogate pairs
else
ch = dat[pos += 1]
if ch < 0xa0 # not surrogate pairs
buf[out += 1] = 0xed
buf[out += 1] = ch
buf[out += 1] = dat[pos += 1]
else
# Pick up surrogate pairs (CESU-8 format)
ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
+ (((dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f))
- 0xc00)
pos += 4
output_utf8_4byte!(buf, out, ch)
out += 4
end
end
end
UTF8String(buf)
end

function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractString)
l = length(a)
idx = 1
iscopy = false
while idx <= l
if is_utf8_start(a[idx])
if !is_valid_continuation(a[idx])
nextidx = idx+1+utf8_trailing[a[idx]+1]
(nextidx <= (l+1)) && (idx = nextidx; continue)
end
!iscopy && (a = copy(a); iscopy = true)
endn = idx
while endn <= l
is_utf8_start(a[endn]) && break
!is_valid_continuation(a[endn]) && break
endn += 1
end
(endn > idx) && (endn -= 1)
Expand All @@ -240,7 +302,7 @@ function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractStr
end
convert(::Type{UTF8String}, s::AbstractString) = utf8(bytestring(s))

"
"""
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
### Input Arguments:
Expand All @@ -249,7 +311,7 @@ Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
### Returns:
* `UTF8String`
"
"""
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
buf = Vector{UInt8}(len)
out = 0
Expand Down
59 changes: 31 additions & 28 deletions test/unicode/utf32.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ str3_UTF8 = "abcd\uff\uff\u7fff\u7fff"
str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff"
strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80")
strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000")
strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80")
strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0")
strZ = b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80"

strA_UTF16 = utf16(strA_UTF8)
strL_UTF16 = utf16(strL_UTF8)
Expand Down Expand Up @@ -68,97 +68,100 @@ tstcvt(str4_UTF8,str4_UTF16,str4_UTF32)
@test utf16(strS_UTF32) == strC_UTF8

# Test converting overlong \0
# @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! (in utf8.jl)
@test utf16(strZ_UTF8) == strz_UTF8
@test utf32(strZ_UTF8) == strz_UTF8
@test utf8(strZ) == strz_UTF8
@test utf16(UTF8String(strZ)) == strz_UTF8
@test utf32(UTF8String(strZ)) == strz_UTF8

# Test invalid sequences

strval(::Type{UTF8String}, dat) = dat
strval(::Union(Type{UTF16String},Type{UTF32String}), dat) = UTF8String(dat)

byt = 0x0
for T in (UTF16String, UTF32String)
for T in (UTF8String, UTF16String, UTF32String)
try
# Continuation byte not after lead
for byt in 0x80:0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt]))
end

# Test lead bytes
for byt in 0xc0:0xff
# Single lead byte at end of string
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt]))
# Lead followed by non-continuation character < 0x80
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0]))
# Lead followed by non-continuation character > 0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0xc0]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0xc0]))
end

# Test overlong 2-byte
for byt in 0x81:0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xc0,byt]))
@test_throws UnicodeError convert(T, strval(T, UInt8[0xc0,byt]))
end
for byt in 0x80:0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xc1,byt]))
@test_throws UnicodeError convert(T, strval(T, UInt8[0xc1,byt]))
end

# Test overlong 3-byte
for byt in 0x80:0x9f
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xe0,byt,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[0xe0,byt,0x80]))
end

# Test overlong 4-byte
for byt in 0x80:0x8f
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xef,byt,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[0xef,byt,0x80,0x80]))
end

# Test 4-byte > 0x10ffff
for byt in 0x90:0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[0xf4,byt,0x80,0x80]))
end
for byt in 0xf5:0xf7
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80]))
end

# Test 5-byte
for byt in 0xf8:0xfb
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80,0x80]))
end

# Test 6-byte
for byt in 0xfc:0xfd
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
end

# Test 7-byte
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))

# Three and above byte sequences
for byt in 0xe0:0xef
# Lead followed by only 1 continuation byte
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80]))
# Lead ended by non-continuation character < 0x80
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0]))
# Lead ended by non-continuation character > 0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0xc0]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0xc0]))
end

# 3-byte encoded surrogate character(s)
# Single surrogate
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80]))
# Not followed by surrogate
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
# Trailing surrogate first
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
# Followed by lead surrogate
@test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))

# Four byte sequences
for byt in 0xf0:0xf4
# Lead followed by only 2 continuation bytes
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80]))
# Lead followed by non-continuation character < 0x80
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0]))
# Lead followed by non-continuation character > 0xbf
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0]))
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0xc0]))
end
catch exp ;
println("Error checking $T: $byt")
Expand Down

0 comments on commit 416a23e

Please sign in to comment.