From e77b560ecae3ea6ab72b23682862616ee9ea5b28 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Tue, 28 Jul 2015 18:08:36 -0400 Subject: [PATCH 1/2] Fix incorrect handling of CESU-8 string in convert(UTF8String, Vector{UInt8}) --- base/unicode/utf8.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/base/unicode/utf8.jl b/base/unicode/utf8.jl index ec59ec5083ac0..2028f942285b0 100644 --- a/base/unicode/utf8.jl +++ b/base/unicode/utf8.jl @@ -267,9 +267,9 @@ function convert(::Type{UTF8String}, dat::Vector{UInt8}) buf[out += 1] = dat[pos += 1] else # Pick up surrogate pairs (CESU-8 format) - ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10) - + (((dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f)) - - 0xc00) + ch = ((((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10) + + (((dat[pos + 3] & 0x3f)%UInt32 << 6) | (dat[pos + 4] & 0x3f))) + - 0x01f0c00) pos += 4 output_utf8_4byte!(buf, out, ch) out += 4 From f931a0d552bd7d0263fed4c96a766fc78bd1b5c7 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Tue, 28 Jul 2015 22:06:19 -0400 Subject: [PATCH 2/2] Add tests for convert with CESU-8 input --- test/unicode.jl | 1 + test/unicode/utf8.jl | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 test/unicode/utf8.jl diff --git a/test/unicode.jl b/test/unicode.jl index 1e3c384306cd0..862aa7cf2691d 100644 --- a/test/unicode.jl +++ b/test/unicode.jl @@ -1,6 +1,7 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license include("unicode/checkstring.jl") +include("unicode/utf8.jl") include("unicode/utf16.jl") include("unicode/utf32.jl") include("unicode/utf8proc.jl") \ No newline at end of file diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl new file mode 100644 index 0000000000000..af576c52733f9 --- /dev/null +++ b/test/unicode/utf8.jl @@ -0,0 +1,12 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license + +## Test for CESU-8 sequences: each surrogate pair (hichar, lochar), visited in ascending order, must convert to the next code point counting up from U+10000 + +let ch = 0x10000 + for hichar = 0xd800:0xdbff + for lochar = 0xdc00:0xdfff + @test convert(UTF8String, utf8(Char[hichar, lochar]).data) == string(Char(ch)) + ch += 1 + end + end +end