Merge pull request #11624 from ScottPJones/spj/fixutf8

Fix #10959, fix #11463 bugs with UTF-8 conversions
JuliaLang · Jul 28, 2015 · 416a23e · 416a23e
2 parents 88bb2e9 + 91305f7
commit 416a23e
Show file tree

Hide file tree

Showing 2 changed files with 108 additions and 43 deletions.
diff --git a/base/unicode/utf8.jl b/base/unicode/utf8.jl
@@ -3,7 +3,7 @@
 ## from base/boot.jl:
 #
 # immutable UTF8String <: AbstractString
-#     data::Array{UInt8,1}
+#     data::Vector{UInt8}
 # end
 #
 
@@ -26,6 +26,8 @@ const utf8_trailing = [
     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
 ]
 
+# Retained, although undocumented and unexported, because it is used by a package
+# (MutableStrings); it should be deprecated
 is_utf8_start(byte::UInt8) = ((byte&0xc0)!=0x80)
 
 ## required core functionality ##
@@ -34,19 +36,17 @@ function endof(s::UTF8String)
     d = s.data
     i = length(d)
     i == 0 && return i
-    while !is_utf8_start(d[i])
+    while is_valid_continuation(d[i])
         i -= 1
     end
     i
 end
 
-is_utf8_continuation(byte::UInt8) = ((byte&0xc0) == 0x80)
-
 function length(s::UTF8String)
     d = s.data
     cnum = 0
     for i = 1:length(d)
-        @inbounds cnum += !is_utf8_continuation(d[i])
+        @inbounds cnum += !is_valid_continuation(d[i])
     end
     cnum
 end
@@ -65,7 +65,7 @@ function next(s::UTF8String, i::Int)
 
     d = s.data
     b = d[i]
-    if !is_utf8_start(b)
+    if is_valid_continuation(b)
         throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
     end
     trailing = utf8_trailing[b+1]
@@ -93,7 +93,7 @@ end
 function reverseind(s::UTF8String, i::Integer)
     j = lastidx(s) + 1 - i
     d = s.data
-    while !is_utf8_start(d[j])
+    while is_valid_continuation(d[j])
         j -= 1
     end
     return j
@@ -106,7 +106,7 @@ sizeof(s::UTF8String) = sizeof(s.data)
 lastidx(s::UTF8String) = length(s.data)
 
 isvalid(s::UTF8String, i::Integer) =
-    (1 <= i <= endof(s.data)) && is_utf8_start(s.data[i])
+    (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
 
 const empty_utf8 = UTF8String(UInt8[])
 
@@ -133,7 +133,7 @@ function search(s::UTF8String, c::Char, i::Integer)
         throw(BoundsError(s, i))
     end
     d = s.data
-    if !is_utf8_start(d[i])
+    if is_valid_continuation(d[i])
         throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
     end
     c < Char(0x80) && return search(d, c%UInt8, i)
@@ -216,20 +216,82 @@ convert(::Type{UTF8String}, s::UTF8String) = s
 convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
 convert(::Type{SubString{UTF8String}}, s::SubString{ASCIIString}) =
     SubString(utf8(s.string), s.offset+1, s.endof+s.offset)
-convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8))
-function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
+
+"""
+Converts a UTF-8 encoded vector of `UInt8` to a `UTF8String`
+
+### Returns:
+*   `UTF8String`
+
+### Throws:
+*   `UnicodeError`
+"""
+function convert(::Type{UTF8String}, dat::Vector{UInt8})
+    # handle zero length string quickly (returns the shared `empty_utf8` constant)
+    isempty(dat) && return empty_utf8
+    # get the flags and per-width character counts needed to size the output
+    # NOTE(review): `unsafe_checkstring` is presumably what throws `UnicodeError`
+    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat)
+    if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
+        # Nothing has to be re-encoded: return a plain copy of the input bytes
+        len = sizeof(dat)
+        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
+    end
+    # Copy, but eliminate over-long encodings and surrogate pairs
+    # Output size in bytes: this treats `len` as the character count and adds the
+    # extra bytes needed by the 2-, 3- and 4-byte characters
+    len += num2byte + num3byte*2 + num4byte*3
+    buf = Vector{UInt8}(len)
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle over-long 2-byte encodings (lead byte < 0xc2): collapse to one byte
+        elseif ch < 0xc2
+            buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
+        # Handle 0x80-0x7ff (2-byte sequences are copied unchanged)
+        elseif ch < 0xe0
+            buf[out += 1] = ch
+            buf[out += 1] = dat[pos += 1]
+        # Handle 3-byte and 4-byte sequences whose lead byte is not 0xed
+        elseif ch != 0xed
+            buf[out += 1] = ch
+            buf[out += 1] = dat[pos += 1]
+            buf[out += 1] = dat[pos += 1]
+            # Copy 4-byte encoded value
+            ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
+        # Handle lead byte 0xed, which may start an encoded surrogate
+        else
+            ch = dat[pos += 1]
+            if ch < 0xa0 # not surrogate pairs
+                buf[out += 1] = 0xed
+                buf[out += 1] = ch
+                buf[out += 1] = dat[pos += 1]
+            else
+                # Pick up surrogate pairs (CESU-8 format).
+                # After masking with 0x3f the lead half still carries 0x800 and the
+                # trail half 0xc00, so subtract (0x800 << 10) + 0xc00 and add the
+                # 0x10000 supplementary offset: 0x200c00 - 0x10000 = 0x1f0c00.
+                # The trail byte is widened to UInt32 before shifting so the result
+                # does not depend on the width of a UInt8 shift.
+                # NOTE(review): assumes `output_utf8_4byte!` takes the code point
+                # (0x10000-0x10ffff) -- confirm against its definition.
+                ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
+                        + ((UInt32(dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f))
+                        - 0x001f0c00)
+                pos += 4
+                output_utf8_4byte!(buf, out, ch)
+                out += 4
+            end
+        end
+    end
+    UTF8String(buf)
+end
+
+function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractString)
     l = length(a)
     idx = 1
     iscopy = false
     while idx <= l
-        if is_utf8_start(a[idx])
+        if !is_valid_continuation(a[idx])
             nextidx = idx+1+utf8_trailing[a[idx]+1]
             (nextidx <= (l+1)) && (idx = nextidx; continue)
         end
         !iscopy && (a = copy(a); iscopy = true)
         endn = idx
         while endn <= l
-            is_utf8_start(a[endn]) && break
+            !is_valid_continuation(a[endn]) && break
             endn += 1
         end
         (endn > idx) && (endn -= 1)
@@ -240,7 +302,7 @@ function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractStr
 end
 convert(::Type{UTF8String}, s::AbstractString) = utf8(bytestring(s))
 
-"
+"""
 Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
 
 ### Input Arguments:
@@ -249,7 +311,7 @@ Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
 
 ### Returns:
 * `UTF8String`
-"
+"""
 function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
     buf = Vector{UInt8}(len)
     out = 0

diff --git a/test/unicode/utf32.jl b/test/unicode/utf32.jl
@@ -32,8 +32,8 @@ str3_UTF8 = "abcd\uff\uff\u7fff\u7fff"
 str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff"
 strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80")
 strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000")
-strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80")
 strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0")
+strZ      = b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80"
 
 strA_UTF16 = utf16(strA_UTF8)
 strL_UTF16 = utf16(strL_UTF8)
@@ -68,97 +68,100 @@ tstcvt(str4_UTF8,str4_UTF16,str4_UTF32)
 @test utf16(strS_UTF32) == strC_UTF8
 
 # Test converting overlong \0
-# @test utf8(strZ_UTF8)  == strz_UTF8   # currently broken! (in utf8.jl)
-@test utf16(strZ_UTF8) == strz_UTF8
-@test utf32(strZ_UTF8) == strz_UTF8
+@test utf8(strZ)  == strz_UTF8
+@test utf16(UTF8String(strZ)) == strz_UTF8
+@test utf32(UTF8String(strZ)) == strz_UTF8
 
 # Test invalid sequences
 
+# Build the argument handed to `convert(T, ...)` in the invalid-sequence tests below:
+# the raw byte vector when the target is UTF8String, otherwise the bytes wrapped in a
+# UTF8String (the tests rely on that constructor not rejecting invalid data).
+strval(::Type{UTF8String}, dat) = dat
+strval(::Union{Type{UTF16String},Type{UTF32String}}, dat) = UTF8String(dat)
+
 byt = 0x0
-for T in (UTF16String, UTF32String)
+for T in (UTF8String, UTF16String, UTF32String)
     try
     # Continuation byte not after lead
     for byt in 0x80:0xbf
-        @test_throws UnicodeError convert(T,  UTF8String(UInt8[byt]))
+        @test_throws UnicodeError convert(T,  strval(T, UInt8[byt]))
     end
 
     # Test lead bytes
     for byt in 0xc0:0xff
         # Single lead byte at end of string
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt]))
         # Lead followed by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0]))
         # Lead followed by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0xc0]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0xc0]))
     end
 
     # Test overlong 2-byte
     for byt in 0x81:0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc0,byt]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[0xc0,byt]))
     end
     for byt in 0x80:0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc1,byt]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[0xc1,byt]))
     end
 
     # Test overlong 3-byte
     for byt in 0x80:0x9f
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xe0,byt,0x80]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[0xe0,byt,0x80]))
     end
 
     # Test overlong 4-byte
     for byt in 0x80:0x8f
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xef,byt,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[0xef,byt,0x80,0x80]))
     end
 
     # Test 4-byte > 0x10ffff
     for byt in 0x90:0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[0xf4,byt,0x80,0x80]))
     end
     for byt in 0xf5:0xf7
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80]))
     end
 
     # Test 5-byte
     for byt in 0xf8:0xfb
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80,0x80]))
     end
 
     # Test 6-byte
     for byt in 0xfc:0xfd
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
     end
 
     # Test 7-byte
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))
+    @test_throws UnicodeError convert(T, strval(T, UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))
 
     # Three and above byte sequences
     for byt in 0xe0:0xef
         # Lead followed by only 1 continuation byte
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80]))
         # Lead ended by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0]))
         # Lead ended by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0xc0]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0xc0]))
     end
 
     # 3-byte encoded surrogate character(s)
     # Single surrogate
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80]))
+    @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80]))
     # Not followed by surrogate
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
+    @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
     # Trailing surrogate first
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
+    @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
     # Followed by lead surrogate
-    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))
+    @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))
 
     # Four byte sequences
     for byt in 0xf0:0xf4
         # Lead followed by only 2 continuation bytes
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80]))
         # Lead followed by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0]))
         # Lead followed by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0]))
+        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0xc0]))
     end
     catch exp ;
         println("Error checking $T: $byt")