Skip to content

Commit

Permalink
Merge pull request JuliaLang#12360 from ScottPJones/spj/cesu8fix
Browse files Browse the repository at this point in the history
Fix a bug handling CESU-8 strings in convert(UTF8String, Vector{UInt8})
  • Loading branch information
JeffBezanson committed Jul 29, 2015
2 parents 6fafbbc + f931a0d commit 5e12592
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 3 deletions.
6 changes: 3 additions & 3 deletions base/unicode/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -267,9 +267,9 @@ function convert(::Type{UTF8String}, dat::Vector{UInt8})
buf[out += 1] = dat[pos += 1]
else
# Pick up surrogate pairs (CESU-8 format)
ch = (((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
+ (((dat[pos + 3] & 0x3f) << 6) | (dat[pos + 4] & 0x3f))
- 0xc00)
ch = ((((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
+ (((dat[pos + 3] & 0x3f)%UInt32 << 6) | (dat[pos + 4] & 0x3f)))
- 0x01f0c00)
pos += 4
output_utf8_4byte!(buf, out, ch)
out += 4
Expand Down
1 change: 1 addition & 0 deletions test/unicode.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

include("unicode/checkstring.jl")
include("unicode/utf8.jl")
include("unicode/utf16.jl")
include("unicode/utf32.jl")
include("unicode/utf8proc.jl")
12 changes: 12 additions & 0 deletions test/unicode/utf8.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

## Test for CESU-8 sequences

# Walk every (high surrogate, low surrogate) combination, low surrogate varying
# fastest. Each pair is encoded with `utf8(Char[hi, lo]).data` and the resulting
# bytes are converted back with `convert(UTF8String, ...)`; the result must equal
# the one-character string for the running code point `expected`, which starts at
# 0x10000 and advances by one per pair.
let expected = 0x10000
    for hi in 0xd800:0xdbff, lo in 0xdc00:0xdfff
        @test convert(UTF8String, utf8(Char[hi, lo]).data) == string(Char(expected))
        expected += 1
    end
end

0 comments on commit 5e12592

Please sign in to comment.