From 48cf84fd0fb65f7b57bd45a593e3b12b6c33badb Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Sun, 14 Jun 2015 10:34:51 -0400 Subject: [PATCH] Updated to use unsafe_checkstring, fix comments --- base/utf8.jl | 8 +- base/utfconvert.jl | 444 --------------------------------------------- 2 files changed, 2 insertions(+), 450 deletions(-) delete mode 100644 base/utfconvert.jl diff --git a/base/utf8.jl b/base/utf8.jl index 881f694b53632a..db246148c517f7 100644 --- a/base/utf8.jl +++ b/base/utf8.jl @@ -220,12 +220,8 @@ convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data) " Converts a UTF-8 encoded vector of `UInt8` to a `UTF8String` -### Input Arguments: -* `::Type{UTF8String}` -* `dat::Vector{UInt8}` - ### Returns: -* `::UTF8String` +* `UTF8String` ### Throws: * `UnicodeError` @@ -234,7 +230,7 @@ function convert(::Type{UTF8String}, dat::Vector{UInt8}) # handle zero length string quickly isempty(dat) && return empty_utf8 # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = check_string(dat) + len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat) if (flags & (UTF_LONG | UTF_SURROGATE)) == 0 len = sizeof(dat) @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) diff --git a/base/utfconvert.jl b/base/utfconvert.jl deleted file mode 100644 index cd5b12cb8b068b..00000000000000 --- a/base/utfconvert.jl +++ /dev/null @@ -1,444 +0,0 @@ -# This file is a part of Julia. License is MIT: http://julialang.org/license - -# Functions to convert to different UTF encodings - -# Quickly copy and set trailing \0 -@inline function fast_utf_copy(T::Type{UInt16}, len, dat) - @inbounds return UTF16String(setindex!(copy!(Vector{T}(len), dat), 0, len)) -end -@inline function fast_utf_copy(T::Type{Char}, len, dat) - @inbounds return UTF32String(setindex!(copy!(Vector{T}(len), dat), 0, len)) -end - -# Get rest of character ch from 3-byte UTF-8 sequence in dat -@inline function get_utf8_3(dat, pos, ch) - @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f) -end - -# Get rest of character ch from 4-byte UTF-8 sequence in dat -@inline function get_utf8_4(dat, pos, ch) - @inbounds return (((ch & 0x7) << 18) - | (UInt32(dat[pos-2] & 0x3f) << 12) - | (UInt32(dat[pos-1] & 0x3f) << 6) - | (dat[pos] & 0x3f)) -end - -# Output a character as a 4-byte UTF-8 sequence -@inline function output_utf8_4(buf, out, ch) - @inbounds begin - buf[out + 1] = 0xf0 | (ch >>> 18) - buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f) - buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f) - buf[out + 4] = 0x80 | (ch & 0x3f) - end -end - -#= -""" -@brief Converts an AbstractString to a UTF16String - -@param[in] ::Type{UTF16String} -@param[in] str::AbstractString - -@return ::UTF16String -@throws ArgumentError -""" -=# -function convert(::Type{UTF16String}, str::AbstractString) - len, flags, num4byte = check_string_abs(str) - buf = Vector{UInt16}(len+num4byte+1) - out = 0 - @inbounds for ch in str - c = UInt32(ch) - if c < 0x10000 - buf[out += 1] = UInt16(c) - else - # output surrogate pair - buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) - buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) - end - end - @inbounds buf[out + 1] = 0 # NULL termination - UTF16String(buf) -end - -#= -""" -@brief Converts an AbstractString to a UTF32String - -@param[in] ::Type{UTF32String} -@param[in] str::AbstractString - -@return ::UTF32String -@throws ArgumentError -""" -=# -function convert(::Type{UTF32String}, str::AbstractString) - len, flags = check_string_abs(str) - buf = Vector{Char}(len+1) - out = 0 - @inbounds for ch in str ; buf[out += 1] = ch ; end - @inbounds buf[out + 1] = 0 # NULL termination - UTF32String(buf) -end - -#= -@doc """ -@brief Converts a UTF8String to a UTF16String - -@param[in] ::Type{UTF16String} -@param[in] str::UTF8String - -@return ::UTF16String -@throws ArgumentError -""" -> -=# -function convert(::Type{UTF16String}, str::UTF8String) - dat = str.data - # handle zero length string quickly - sizeof(dat) == 0 && return empty_utf16 - # Check that is correct UTF-8 encoding and get number of words needed - len, flags, num4byte = check_string_utf8(dat) - len += num4byte - buf = Vector{UInt16}(len+1) - @inbounds buf[len+1] = 0 - # Optimize case where no characters > 0x7f - flags == 0 && @inbounds return UTF16String(copy!(buf, dat)) - out = 0 - pos = 0 - @inbounds while out < len - ch::UInt32 = dat[pos += 1] - # Handle ASCII characters - if ch <= 0x7f - buf[out += 1] = ch - # Handle range 0x80-0x7ff - elseif ch < 0xe0 - buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) - # Handle range 0x800-0xffff - elseif ch < 0xf0 - pos += 2 - buf[out += 1] = get_utf8_3(dat, pos, ch) - # Handle range 0x10000-0x10ffff - else - pos += 3 - ch = get_utf8_4(dat, pos, ch) - # output surrogate pair - buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) - buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) - end - end - UTF16String(buf) -end - -#= -@doc """ -@brief Converts a UTF-16 encoded vector of UInt16 to a UTF8String - -@param[in] ::Type{UTF8String} -@param[in] dat::Vector{UInt16} - -@return ::UTF8String -@throws ArgumentError -""" -> -=# -function convert(::Type{UTF8String}, dat::Vector{UInt16}) - len = sizeof(dat) - # handle zero length string quickly - len == 0 && return UTF8String("") - # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len>>>1) - flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat)) - return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) -end - -#= -@doc """ -@brief Converts a UTF16String to a UTF8String - -@param[in] ::Type{UTF8String} -@param[in] str::UTF16String - -@return ::UTF8String -@throws ArgumentError -""" -> -=# -function convert(::Type{UTF8String}, str::UTF16String) - dat = str.data - len = sizeof(dat) >>> 1 - # handle zero length string quickly - len <= 1 && return UTF8String("") - # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1) - flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) - return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) -end - -#= -@doc """ -@brief Encodes a UTF-32 encoded vector of UInt32 to a UTF8String - -@param[in] ::Type{UTF8String} -@param[in] dat::Vector{UInt32} - -@return ::UTF8String -@throws ArgumentError -""" -> -=# -function convert(::Type{UTF8String}, dat::Vector{UInt32}) - len = sizeof(dat) - # handle zero length string quickly - len == 0 && return UTF8String("") - # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len>>>2) - flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) - return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) -end - -#= -@doc """ -@brief Converts a UTF32String to a UTF8String - -@param[in] ::Type{UTF8String} -@param[in] str::UTF32String - -@return ::UTF8String -@throws ArgumentError -""" -> -=# -function convert(::Type{UTF8String}, str::UTF32String) - dat = reinterpret(UInt32, str.data) - len = sizeof(dat) >>> 2 - # handle zero length string quickly - len <= 1 && return UTF8String("") - # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len-1) - flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) - return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) -end - -#= -@doc """ -@brief Converts an already validated vector of UInt16 or UInt32 to a UTF8String - -@param[in] T type (UInt16 or UInt32) -@param[in] dat Vector{T} -@param[in] len length of output in bytes - -@return ::UTF8String -""" -> -=# -function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len) - buf = Vector{UInt8}(len) - out = 0 - pos = 0 - @inbounds while out < len - ch::UInt32 = dat[pos += 1] - # Handle ASCII characters - if ch <= 0x7f - buf[out += 1] = ch - # Handle 0x80-0x7ff - elseif ch < 0x800 - buf[out += 1] = 0xc0 | (ch >>> 6) - buf[out += 1] = 0x80 | (ch & 0x3f) - # Handle 0x10000-0x10ffff (if input is UInt32) - elseif T == UInt32 && ch > 0xffff - output_utf8_4(buf, out, ch) - out += 4 - # Handle surrogate pairs - elseif is_surrogate_codeunit(ch) - output_utf8_4(buf, out, get_supplementary(ch, dat[pos += 1])) - out += 4 - # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters - else - buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) - buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f) - buf[out += 1] = 0x80 | (ch & 0x3f) - end - end - UTF8String(buf) -end - -#= -""" -@brief Converts a UTF8String to a UTF32String - -@param[in] ::Type{UTF32String} -@param[in] str::UTF8String - -@return ::UTF32String -@throws ArgumentError -""" -=# -function convert(::Type{UTF32String}, str::UTF8String) - dat = str.data - # handle zero length string quickly - sizeof(dat) == 0 && return empty_utf32 - # Validate UTF-8 encoding, and get number of words to create - len, flags = check_string_utf8(dat) - # Optimize case where no characters > 0x7f - totlen = len+1 - flags == 0 && return fast_utf_copy(Char, totlen, dat) - # has multi-byte UTF-8 sequences - buf = Vector{Char}(totlen) - @inbounds buf[totlen] = 0 # NULL termination - local ch::UInt32, surr::UInt32 - out = 0 - pos = 0 - @inbounds while out < len - ch = dat[pos += 1] - # Handle ASCII characters - if ch <= 0x7f - buf[out += 1] = ch - # Handle range 0x80-0x7ff - elseif ch < 0xe0 - buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) - # Handle range 0x800-0xffff - elseif ch < 0xf0 - pos += 2 - ch = get_utf8_3(dat, pos, ch) - # Handle surrogate pairs (should have been encoded in 4 bytes) - if is_surrogate_lead(ch) - # Build up 32-bit character from ch and trailing surrogate in next 3 bytes - pos += 3 - surr = ((UInt32(dat[pos-2] & 0xf) << 12) - | (UInt32(dat[pos-1] & 0x3f) << 6) - | (dat[pos] & 0x3f)) - ch = get_supplementary(ch, surr) - end - buf[out += 1] = ch - # Handle range 0x10000-0x10ffff - else - pos += 3 - buf[out += 1] = get_utf8_4(dat, pos, ch) - end - end - UTF32String(buf) -end - -#= -""" -@brief Converts a UTF16String to UTF32String - -@param[in] ::Type{UTF32String} -@param[in] str::UTF16String - -@return ::UTF32String -@throws ArgumentError -""" -=# -function convert(::Type{UTF32String}, str::UTF16String) - dat = str.data - len = sizeof(dat) - # handle zero length string quickly (account for trailing \0) - len <= 2 && return empty_utf32 - # get number of words to create - len, flags, num4byte = check_string_utf16(dat, len>>>1) - # No surrogate pairs, do optimized copy - (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat)) - local ch::UInt32 - buf = Vector{Char}(len) - out = 0 - pos = 0 - @inbounds while out < len - ch = dat[pos += 1] - # check for surrogate pair - if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end - buf[out += 1] = ch - end - UTF32String(buf) -end - -#= -""" -@brief Converts a UTF-32 encoded vector of UInt32 to a UTF16String - -@param[in] ::Type{UTF16String} -@param[in] dat::Vector{UInt32} - -@return ::UTF16String -@throws ArgumentError -""" -=# -function convert(::Type{UTF16String}, dat::Vector{UInt32}) - len = sizeof(dat) - # handle zero length string quickly - len <= 4 && return empty_utf16 - # get number of words to allocate - len, flags, num4byte = check_string_utf32(dat, len>>>2) - len += num4byte + 1 - # optimized path, no surrogates - num4byte == 0 && return fast_utf_copy(UInt16, len, dat) - return encode_to_utf16(dat, len) -end - -#= -""" -@brief Converts a UTF32String to UTF16String - -@param[in] ::Type{UTF16String} -@param[in] str::UTF32String - -@return ::UTF16String -@throws ArgumentError -""" -=# -function convert(::Type{UTF16String}, str::UTF32String) - dat = reinterpret(UInt32, str.data) - len = sizeof(dat) - # handle zero length string quickly - len <= 4 && return empty_utf16 - # get number of words to allocate - len, flags, num4byte = check_string_utf32(dat, len>>>2) - # optimized path, no surrogates - num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat)) - return encode_to_utf16(dat, len + num4byte) -end - -#= -@doc """ -@brief Converts an already validated UTF-32 encoded vector of UInt32 to a UTF16String - -@param[in] dat::Vector{UInt32} UTF-32 encoded data -@param[in] len length of output in 16-bit words - -@return ::UTF16String -""" -> -=# -function encode_to_utf16(dat, len) - buf = Vector{UInt16}(len) - @inbounds buf[len] = 0 # NULL termination - out = 0 - pos = 0 - @inbounds while out < len - ch = UInt32(dat[pos += 1]) - if ch > 0xffff - # Output surrogate pair for 0x10000-0x10ffff - buf[out += 1] = 0xd7c0 + (ch >>> 10) - ch = 0xdc00 + (ch & 0x3ff) - end - buf[out += 1] = ch - end - UTF16String(buf) -end - -convert(::Type{UTF8String}, dat::Vector{Char}) = convert(UTF8String, reinterpret(UInt32, dat)) - -function convert(::Type{UTF16String}, str::ASCIIString) - dat = str.data - fast_utf_copy(UInt16, length(dat)+1, dat) -end - -function convert(::Type{UTF32String}, str::ASCIIString) - dat = str.data - fast_utf_copy(Char, length(dat)+1, dat) -end - -convert(::Type{UTF16String}, str::UTF16String) = str -convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat)) - -convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data -convert(::Type{Array{UInt16}}, str::UTF16String) = str.data - -convert(::Type{UTF32String}, str::UTF32String) = str - -convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])