Fix #10959 problems with UTF-8 conversions

JuliaLang · Jun 11, 2015 · f65c6df · f65c6df
1 parent 3df6617
commit f65c6df
Show file tree

Hide file tree

Showing 2 changed files with 119 additions and 48 deletions.
diff --git a/base/utf8.jl b/base/utf8.jl
@@ -3,7 +3,7 @@
 ## from base/boot.jl:
 #
 # immutable UTF8String <: AbstractString
-#     data::Array{UInt8,1}
+#     data::Vector{UInt8}
 # end
 #
 
@@ -26,27 +26,23 @@ const utf8_trailing = [
     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
 ]
 
-is_utf8_start(byte::UInt8) = ((byte&0xc0)!=0x80)
-
 ## required core functionality ##
 
 function endof(s::UTF8String)
     d = s.data
     i = length(d)
     i == 0 && return i
-    while !is_utf8_start(d[i])
+    while is_valid_continuation(d[i])
         i -= 1
     end
     i
 end
 
-is_utf8_continuation(byte::UInt8) = ((byte&0xc0) == 0x80)
-
 function length(s::UTF8String)
     d = s.data
     cnum = 0
     for i = 1:length(d)
-        @inbounds cnum += !is_utf8_continuation(d[i])
+        @inbounds cnum += !is_valid_continuation(d[i])
     end
     cnum
 end
@@ -65,8 +61,17 @@ function next(s::UTF8String, i::Int)
 
     d = s.data
     b = d[i]
-    if !is_utf8_start(b)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
+    if is_valid_continuation(b)
+        j = i-1
+        while 0 < j && is_valid_continuation(d[j])
+            j -= 1
+        end
+        if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
+            # b is a continuation byte of a valid UTF-8 character
+            throw(UnicodeError(UTF_ERR_CONT, i, d[j]))
+        end
+        # move past 1 byte in case the data is actually Latin-1
+        return '\ufffd', i+1
     end
     trailing = utf8_trailing[b+1]
     if length(d) < i + trailing
@@ -93,7 +98,7 @@ end
 function reverseind(s::UTF8String, i::Integer)
     j = lastidx(s) + 1 - i
     d = s.data
-    while !is_utf8_start(d[j])
+    while is_valid_continuation(d[j])
         j -= 1
     end
     return j
@@ -106,19 +111,16 @@ sizeof(s::UTF8String) = sizeof(s.data)
 lastidx(s::UTF8String) = length(s.data)
 
 isvalid(s::UTF8String, i::Integer) =
-    (1 <= i <= endof(s.data)) && is_utf8_start(s.data[i])
+    (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
 
 const empty_utf8 = UTF8String(UInt8[])
 
 function getindex(s::UTF8String, r::UnitRange{Int})
     isempty(r) && return empty_utf8
     i, j = first(r), last(r)
     d = s.data
-    if i < 1 || i > length(s.data)
-        throw(BoundsError(s, i))
-    end
-    if !is_utf8_start(d[i])
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
+    if is_valid_continuation(d[i])
+        i = nextind(s,i)
     end
     if j > length(d)
         throw(BoundsError())
@@ -214,20 +216,86 @@ write(io::IO, s::UTF8String) = write(io, s.data)
 utf8(x) = convert(UTF8String, x)
 convert(::Type{UTF8String}, s::UTF8String) = s
 convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
-convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8))
-function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
+
+"
+Converts a UTF-8 encoded vector of `UInt8` to a `UTF8String` (the bytes are always copied)
+
+### Input Arguments:
+*   `::Type{UTF8String}`
+*   `dat::Vector{UInt8}` bytes to convert; it is not modified
+
+### Returns:
+*   `::UTF8String` sequences flagged `UTF_LONG` / `UTF_SURROGATE` are re-encoded, others copied
+
+### Throws:
+*   `UnicodeError` (not thrown here directly; presumably raised by `check_string`)
+"
+function convert(::Type{UTF8String}, dat::Vector{UInt8})
+    # handle zero length string quickly
+    isempty(dat) && return empty_utf8
+    # scan the input (len is presumably the character count -- TODO confirm in check_string)
+    len, flags, num4byte, num3byte, num2byte = check_string(dat)
+    if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
+        len = sizeof(dat)
+        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
+    end
+    # Copy, but eliminate over-long encodings and surrogate pairs
+    len += num2byte + num3byte*2 + num4byte*3
+    buf = Vector{UInt8}(len)
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle over-long 2-byte encoding (lead byte 0xc0 or 0xc1) of a value < 0x80
+        elseif ch < 0xc2
+            buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
+        # Handle 2-byte sequences (0x80-0x7ff)
+        elseif ch < 0xe0
+            buf[out += 1] = ch
+            buf[out += 1] = dat[pos += 1]
+        elseif ch != 0xed
+            buf[out += 1] = ch
+            buf[out += 1] = dat[pos += 1]
+            buf[out += 1] = dat[pos += 1]
+            # Copy 4-byte encoded value
+            ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
+        # Lead byte 0xed: either an ordinary 3-byte sequence or an encoded surrogate
+        else
+            ch = dat[pos += 1]
+            if ch < 0xa0 # not surrogate pairs
+                buf[out += 1] = 0xed
+                buf[out += 1] = ch
+                buf[out += 1] = dat[pos += 1]
+            else
+                # Pick up surrogate pairs (CESU-8 format); widen to UInt32 before shifting
+                ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
+                        + ((UInt32(dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f))
+                        - 0x01f0c00) # = 0x200000 + 0xc00 (leftover tag bits) - 0x10000
+                pos += 4
+                output_utf8_4byte!(buf, out, ch)
+                out += 4
+            end
+        end
+    end
+    UTF8String(buf)
+end
+
+function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractString)
     l = length(a)
     idx = 1
     iscopy = false
     while idx <= l
-        if is_utf8_start(a[idx])
+        if !is_valid_continuation(a[idx])
             nextidx = idx+1+utf8_trailing[a[idx]+1]
             (nextidx <= (l+1)) && (idx = nextidx; continue)
         end
         !iscopy && (a = copy(a); iscopy = true)
         endn = idx
         while endn <= l
-            is_utf8_start(a[endn]) && break
+            !is_valid_continuation(a[endn]) && break
             endn += 1
         end
         (endn > idx) && (endn -= 1)

diff --git a/test/strings.jl b/test/strings.jl
@@ -1857,8 +1857,8 @@ str3_UTF8 = "abcd\uff\uff\u7fff\u7fff"
 str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff"
 strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80")
 strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000")
-strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80")
 strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0")
+strZ      = b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80"
 
 strA_UTF16 = utf16(strA_UTF8)
 strL_UTF16 = utf16(strL_UTF8)
@@ -1893,97 +1893,100 @@ tstcvt(str4_UTF8,str4_UTF16,str4_UTF32)
 @test utf16(strS_UTF32) == strC_UTF8
 
 # Test converting overlong \0
-# @test utf8(strZ_UTF8)  == strz_UTF8   # currently broken! (in utf8.jl)
-@test utf16(strZ_UTF8) == strz_UTF8
-@test utf32(strZ_UTF8) == strz_UTF8
+@test utf8(strZ)  == strz_UTF8
+@test utf16(UTF8String(strZ)) == strz_UTF8
+@test utf32(UTF8String(strZ)) == strz_UTF8
 
 # Test invalid sequences
 
+@inline strval(::Type{UTF8String}, dat) = dat
+@inline strval(::Union(Type{UTF16String},Type{UTF32String}), dat) = UTF8String(dat)
+
 byt = 0x0
-for T in (UTF16String, UTF32String)
+for T in (UTF8String, UTF16String, UTF32String)
     try
     # Continuation byte not after lead
     for byt in 0x80:0xbf
-        @test_throws UnicodeError convert(T,  UTF8String(UInt8[byt]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt]))
     end
 
     # Test lead bytes
     for byt in 0xc0:0xff
         # Single lead byte at end of string
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt]))
         # Lead followed by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0]))
         # Lead followed by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0xc0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0xc0]))
     end
 
     # Test overlong 2-byte
     for byt in 0x81:0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc0,byt]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[0xc0,byt]))
     end
     for byt in 0x80:0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc1,byt]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[0xc1,byt]))
     end
 
     # Test overlong 3-byte
     for byt in 0x80:0x9f
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xe0,byt,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[0xe0,byt,0x80]))
     end
 
     # Test overlong 4-byte
     for byt in 0x80:0x8f
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xef,byt,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[0xef,byt,0x80,0x80]))
     end
 
     # Test 4-byte > 0x10ffff
     for byt in 0x90:0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[0xf4,byt,0x80,0x80]))
     end
     for byt in 0xf5:0xf7
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80]))
     end
 
     # Test 5-byte
     for byt in 0xf8:0xfb
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80,0x80]))
     end
 
     # Test 6-byte
     for byt in 0xfc:0xfd
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
     end
 
     # Test 7-byte
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))
+    @test_throws UnicodeError convert(T, strval(T,UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))
 
     # Three and above byte sequences
     for byt in 0xe0:0xef
         # Lead followed by only 1 continuation byte
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80]))
         # Lead ended by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0]))
         # Lead ended by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0xc0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0xc0]))
     end
 
     # 3-byte encoded surrogate character(s)
     # Single surrogate
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80]))
+    @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80]))
     # Not followed by surrogate
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
+    @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
     # Trailing surrogate first
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
+    @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
     # Followed by lead surrogate
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))
+    @test_throws UnicodeError convert(T, strval(T,UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))
 
     # Four byte sequences
     for byt in 0xf0:0xf4
         # Lead followed by only 2 continuation bytes
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80]))
         # Lead followed by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0]))
         # Lead followed by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0]))
+        @test_throws UnicodeError convert(T, strval(T,UInt8[byt,0x80,0x80,0xc0]))
     end
     catch exp ;
         println("Error checking $T: $byt")