Skip to content

Commit

Permalink
Move some code from utf32.jl to utf16.jl and utf8.jl, hopefully more …
Browse files Browse the repository at this point in the history
…logical
  • Loading branch information
ScottPJones committed Jul 10, 2015
1 parent cab2e4c commit 4424f42
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 97 deletions.
64 changes: 36 additions & 28 deletions base/unicode/utf16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -195,44 +195,52 @@ function convert(::Type{UTF8String}, str::UTF16String)
end

"
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
Converts a vector of `Char` to a `UTF16String`
### Returns:
* `::UTF16String`
### Throws:
* `UnicodeError`
"
function convert(::Type{UTF16String}, chrs::Vector{Char})
    # Build a UTF16String from a vector of Char by viewing the characters as
    # raw UInt32 values and re-encoding them as 16-bit words.
    nbytes = sizeof(chrs)
    # Empty input: hand back the shared empty string without scanning
    if nbytes == 0
        return empty_utf16
    end
    dat = reinterpret(UInt32, chrs)
    # Scan all nbytes>>>2 elements (4 bytes per element after the reinterpret).
    # NOTE(review): the UnicodeError mentioned in the docstring presumably
    # originates from this call - confirm against unsafe_checkstring.
    nchars, flags, num4byte = unsafe_checkstring(dat, 1, nbytes >>> 2)
    # Words to allocate: one per character, one more for each character counted
    # in num4byte, plus one extra word (presumably the NULL terminator)
    nwords = nchars + num4byte + 1
    if num4byte == 0
        # Fast path: no surrogate pairs have to be produced
        @inbounds return fast_utf_copy(UTF16String, UInt16, nwords, dat)
    end
    return encode_to_utf16(dat, nwords)
end

"
Converts an already validated UTF-32 encoded vector of `UInt32` to a `UTF16String`
### Input Arguments:
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
* `len` length of output in bytes
* `dat::Vector{UInt32}` UTF-32 encoded data
* `len` length of output in 16-bit words
### Returns:
* `UTF8String`
* `::UTF16String`
"
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
buf = Vector{UInt8}(len)
function encode_to_utf16(dat, len)
buf = Vector{UInt16}(len)
@inbounds buf[len] = 0 # NULL termination
out = 0
pos = 0
@inbounds while out < len
ch::UInt32 = dat[pos += 1]
# Handle ASCII characters
if ch <= 0x7f
buf[out += 1] = ch
# Handle 0x80-0x7ff
elseif ch < 0x800
buf[out += 1] = 0xc0 | (ch >>> 6)
buf[out += 1] = 0x80 | (ch & 0x3f)
# Handle 0x10000-0x10ffff (if input is UInt32)
elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
output_utf8_4byte!(buf, out, ch)
out += 4
# Handle surrogate pairs
elseif is_surrogate_codeunit(ch)
output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
out += 4
# Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
else
buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
buf[out += 1] = 0x80 | (ch & 0x3f)
ch = UInt32(dat[pos += 1])
if ch > 0xffff
# Output surrogate pair for 0x10000-0x10ffff
buf[out += 1] = 0xd7c0 + (ch >>> 10)
ch = 0xdc00 + (ch & 0x3ff)
end
buf[out += 1] = ch
end
UTF8String(buf)
UTF16String(buf)
end

function convert(::Type{UTF16String}, str::ASCIIString)
Expand Down
69 changes: 0 additions & 69 deletions base/unicode/utf32.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,26 +33,6 @@ function convert(::Type{UTF32String}, str::AbstractString)
UTF32String(buf)
end

"
Converts a vector of `Char` to a `UTF8String`
### Returns:
* `UTF8String`
### Throws:
* `UnicodeError`
"
function convert(::Type{UTF8String}, chrs::Vector{Char})
    # Build a UTF8String from a vector of Char by viewing the characters as
    # raw UInt32 values and re-encoding them as bytes.
    nbytes = sizeof(chrs)
    # Empty input: hand back the shared empty string without scanning
    if nbytes == 0
        return empty_utf8
    end
    dat = reinterpret(UInt32, chrs)
    # Scan all nbytes>>>2 elements (4 bytes per element after the reinterpret).
    # NOTE(review): the UnicodeError mentioned in the docstring presumably
    # originates from this call - confirm against unsafe_checkstring.
    nchars, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, nbytes >>> 2)
    if flags == 0
        # No flags raised by the scan: copy each element straight into one byte
        @inbounds return UTF8String(copy!(Vector{UInt8}(nchars), 1, dat, 1, nchars))
    end
    # Bytes to allocate: one per character plus 1, 2 or 3 additional bytes for
    # every character counted as 2-, 3- or 4-byte respectively
    outlen = nchars + num2byte + num3byte*2 + num4byte*3
    return encode_to_utf8(UInt32, dat, outlen)
end

"
Converts a `UTF32String` to a `UTF8String`
Expand Down Expand Up @@ -158,28 +138,6 @@ function convert(::Type{UTF32String}, str::UTF16String)
UTF32String(buf)
end

"
Converts a vector of `Char` to a `UTF16String`
### Returns:
* `::UTF16String`
### Throws:
* `UnicodeError`
"
function convert(::Type{UTF16String}, chrs::Vector{Char})
    # Build a UTF16String from a vector of Char by viewing the characters as
    # raw UInt32 values and re-encoding them as 16-bit words.
    nbytes = sizeof(chrs)
    # Empty input: hand back the shared empty string without scanning
    if nbytes == 0
        return empty_utf16
    end
    dat = reinterpret(UInt32, chrs)
    # Scan all nbytes>>>2 elements (4 bytes per element after the reinterpret).
    # NOTE(review): the UnicodeError mentioned in the docstring presumably
    # originates from this call - confirm against unsafe_checkstring.
    nchars, flags, num4byte = unsafe_checkstring(dat, 1, nbytes >>> 2)
    # Words to allocate: one per character, one more for each character counted
    # in num4byte, plus one extra word (presumably the NULL terminator)
    nwords = nchars + num4byte + 1
    if num4byte == 0
        # Fast path: no surrogate pairs have to be produced
        @inbounds return fast_utf_copy(UTF16String, UInt16, nwords, dat)
    end
    return encode_to_utf16(dat, nwords)
end

"
Converts a `UTF32String` to `UTF16String`
Expand All @@ -201,33 +159,6 @@ function convert(::Type{UTF16String}, str::UTF32String)
return encode_to_utf16(dat, len + num4byte)
end

"
Converts an already validated UTF-32 encoded vector of `UInt32` to a `UTF16String`
### Input Arguments:
* `dat::Vector{UInt32}` UTF-32 encoded data
* `len` length of output in 16-bit words
### Returns:
* `::UTF16String`
"
function encode_to_utf16(dat, len)
    # Encode UTF-32 code units from `dat` into a freshly allocated buffer of
    # `len` 16-bit words and wrap it in a UTF16String.
    #
    # `dat` is expected to be already validated (see the docstring above);
    # `len` is the total output length in 16-bit words, INCLUDING the final
    # word that holds the NULL terminator.
    buf = Vector{UInt16}(len)
    @inbounds buf[len] = 0 # NULL termination
    out = 0
    pos = 0
    # The last word is the terminator written above, so only len-1 words are
    # produced from `dat`.  Looping until `out == len` would read one element
    # past the end of `dat` (unchecked, because of @inbounds) whenever the
    # caller passes data without a trailing 0, and would overwrite the
    # terminator with whatever was read.  When `dat` does end with a 0 the
    # result is identical to running the loop one step further.
    stop = len - 1
    @inbounds while out < stop
        ch = UInt32(dat[pos += 1])
        if ch > 0xffff
            # Output surrogate pair for 0x10000-0x10ffff
            # (0xd7c0 + (ch >>> 10) is 0xd800 + ((ch - 0x10000) >>> 10))
            buf[out += 1] = 0xd7c0 + (ch >>> 10)
            ch = 0xdc00 + (ch & 0x3ff)
        end
        buf[out += 1] = ch
    end
    return UTF16String(buf)
end

function convert(::Type{UTF32String}, c::Char)
    # Wrap the single character in a two-element buffer that ends in Char(0)
    data = Char[c, Char(0)]
    return UTF32String(data)
end

function convert(::Type{UTF32String}, str::ASCIIString)
Expand Down
61 changes: 61 additions & 0 deletions base/unicode/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,67 @@ function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractStr
end
function convert(::Type{UTF8String}, s::AbstractString)
    # Generic fallback: obtain a byte string for `s`, then pass it to utf8
    bytes = bytestring(s)
    return utf8(bytes)
end

"
Converts a vector of `Char` to a `UTF8String`
### Returns:
* `UTF8String`
### Throws:
* `UnicodeError`
"
function convert(::Type{UTF8String}, chrs::Vector{Char})
    # Build a UTF8String from a vector of Char by viewing the characters as
    # raw UInt32 values and re-encoding them as bytes.
    nbytes = sizeof(chrs)
    # Empty input: hand back the shared empty string without scanning
    if nbytes == 0
        return empty_utf8
    end
    dat = reinterpret(UInt32, chrs)
    # Scan all nbytes>>>2 elements (4 bytes per element after the reinterpret).
    # NOTE(review): the UnicodeError mentioned in the docstring presumably
    # originates from this call - confirm against unsafe_checkstring.
    nchars, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, nbytes >>> 2)
    if flags == 0
        # No flags raised by the scan: copy each element straight into one byte
        @inbounds return UTF8String(copy!(Vector{UInt8}(nchars), 1, dat, 1, nchars))
    end
    # Bytes to allocate: one per character plus 1, 2 or 3 additional bytes for
    # every character counted as 2-, 3- or 4-byte respectively
    outlen = nchars + num2byte + num3byte*2 + num4byte*3
    return encode_to_utf8(UInt32, dat, outlen)
end

"
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
### Input Arguments:
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
* `len` length of output in bytes
### Returns:
* `UTF8String`
"
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
    # Encode the code units in `dat` into a freshly allocated buffer of `len`
    # bytes and wrap it in a UTF8String.  `T` is not referenced in the body.
    buf = Vector{UInt8}(len)
    nwritten = 0   # bytes stored in buf so far
    idx = 0        # index of the last element consumed from dat
    @inbounds while nwritten < len
        idx += 1
        cp::UInt32 = dat[idx]
        if cp <= 0x7f
            # ASCII: one byte, stored as is
            nwritten += 1
            buf[nwritten] = cp
        elseif cp < 0x800
            # 0x80-0x7ff: two bytes
            buf[nwritten + 1] = 0xc0 | (cp >>> 6)
            buf[nwritten + 2] = 0x80 | (cp & 0x3f)
            nwritten += 2
        elseif cp > 0xffff
            # 0x10000-0x10ffff: four bytes (can only occur with UInt32 input)
            output_utf8_4byte!(buf, nwritten, cp)
            nwritten += 4
        elseif is_surrogate_codeunit(cp)
            # Surrogate pair: consume the next code unit as well and emit the
            # combined character as four bytes
            idx += 1
            output_utf8_4byte!(buf, nwritten, get_supplementary(cp, dat[idx]))
            nwritten += 4
        else
            # Remaining values up to 0xffff (0x800-0xd7ff, 0xe000-0xffff): three bytes
            buf[nwritten + 1] = 0xe0 | ((cp >>> 12) & 0x3f)
            buf[nwritten + 2] = 0x80 | ((cp >>> 6) & 0x3f)
            buf[nwritten + 3] = 0x80 | (cp & 0x3f)
            nwritten += 3
        end
    end
    return UTF8String(buf)
end

function utf8(p::Ptr{UInt8})
    # Pointer only: obtain a byte string from `p` and wrap it
    bytes = bytestring(p)
    return UTF8String(bytes)
end

function utf8(p::Ptr{UInt8}, len::Integer)
    # Pointer plus length: view the memory as an array and reuse the array method
    arr = pointer_to_array(p, len)
    return utf8(arr)
end

Expand Down

0 comments on commit 4424f42

Please sign in to comment.