Reorganize UTF handling files

ScottPJones · Jun 2, 2015 · 21bbaef · 21bbaef
1 parent fc0364b
commit 21bbaef
Show file tree

Hide file tree

Showing 7 changed files with 461 additions and 440 deletions.
diff --git a/base/sysimg.jl b/base/sysimg.jl
@@ -84,10 +84,15 @@ include("iterator.jl")
 include("osutils.jl")
 
 # strings & printing
+include("utferror.jl")
+include("utftype.jl")
+include("utfcheck.jl")
 include("char.jl")
 include("ascii.jl")
 include("utf8.jl")
-include("utf.jl")
+include("utf16.jl")
+include("utf32.jl")
+include("utfconvert.jl")
 include("iobuffer.jl")
 include("string.jl")
 include("utf8proc.jl")

diff --git a/base/utf16.jl b/base/utf16.jl
@@ -0,0 +1,72 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+# Count the characters in `s`: every code unit that is not a trailing
+# surrogate starts one. The last element of `s.data` is not examined.
+function length(s::UTF16String)
+    units = s.data
+    nunits = length(units) - 1
+    nunits == 0 && return 0
+    nchars = 0
+    @inbounds for k = 1:nunits
+        if !is_surrogate_trail(units[k])
+            nchars += 1
+        end
+    end
+    return nchars
+end
+
+# Index of the code unit that starts the last character of `s`; 0 when the
+# data vector holds a single element only.
+function endof(s::UTF16String)
+    units = s.data
+    n = length(units) - 1
+    if n == 0
+        return n
+    elseif is_surrogate_codeunit(units[n])
+        # final unit is half of a pair, so the character starts one unit earlier
+        return n - 1
+    else
+        return n
+    end
+end
+
+# Combine a lead/trail surrogate pair into one code point. Subtracting 0xd7f7
+# (instead of 0xd800) from the lead pre-adds 0x2400 == 0x10000 - 0xdc00, which
+# supplies the 0x10000 offset and cancels the trail's 0xdc00 tag in one addition.
+function get_supplementary(lead::Unsigned, trail::Unsigned)
+    return (UInt32(lead - 0xd7f7) << 10) + trail
+end
+
+# Decode the character starting at code unit `i` and return it together with
+# the index of the following character. Surrogate problems are reported
+# through utf_errfunc.
+function next(s::UTF16String, i::Int)
+    units = s.data
+    lead = units[i]
+    if is_surrogate_codeunit(lead)
+        # check length, account for terminating \0
+        if i >= (length(units) - 1)
+            utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(lead))
+        end
+        if !is_surrogate_lead(lead)
+            utf_errfunc(UTF_ERR_NOT_LEAD, i, lead)
+        end
+        trail = units[i+1]
+        if !is_surrogate_trail(trail)
+            utf_errfunc(UTF_ERR_NOT_TRAIL, i, lead)
+        end
+        return (Char(get_supplementary(lead, trail)), i+2)
+    end
+    return (Char(lead), i+1)
+end
+
+# Compute `length(s.data) - i`, stepping one unit further back when that
+# position holds a trailing surrogate.
+function reverseind(s::UTF16String, i::Integer)
+    units = s.data
+    j = length(units) - i
+    if is_surrogate_trail(units[j])
+        return j - 1
+    end
+    return j
+end
+
+# s.data includes NULL terminator, so the last index is one less than its length
+function lastidx(s::UTF16String)
+    return length(s.data) - 1
+end
+
+# Build a new UTF16String whose code units are those of `s` in reverse order,
+# except that a lead surrogate is swapped with the unit written just before it.
+function reverse(s::UTF16String)
+    src = s.data
+    n = length(src)
+    dst = similar(src)
+    dst[end] = 0 # NULL termination
+    @inbounds for i = 1:n-1
+        unit = src[n-i]
+        if !is_surrogate_lead(unit)
+            dst[i] = unit
+        else
+            # for a well-formed pair the trail was written to dst[i-1] on the
+            # previous iteration; swap so the lead precedes it again
+            dst[i] = dst[i-1]
+            dst[i-1] = unit
+        end
+    end
+    return UTF16String(dst)
+end
+
+# Byte size of the data vector minus one UInt16 element
+function sizeof(s::UTF16String)
+    return sizeof(s.data) - sizeof(UInt16)
+end
+
+# Return true when `data` contains no unpaired surrogate code units.
+function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
+    n = length(data) # this may include NULL termination; that's okay
+    i = 1
+    @inbounds while i < n # check for unpaired surrogates
+        unit = data[i]
+        if !is_surrogate_codeunit(unit)
+            i += 1
+        elseif is_surrogate_lead(unit) && is_surrogate_trail(data[i+1])
+            i += 2
+        else
+            return false
+        end
+    end
+    # at most one unit is left over; it must not be a lone surrogate
+    if i > n
+        return true
+    end
+    return !is_surrogate_codeunit(data[i])
+end
diff --git a/base/utf32.jl b/base/utf32.jl
@@ -0,0 +1,33 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+# UTF-32 basic functions
+# Return the element stored at index `i` together with the following index
+function next(s::UTF32String, i::Int)
+    return (s.data[i], i+1)
+end
+# Last index: one less than the number of elements in the data vector
+function endof(s::UTF32String)
+    return length(s.data) - 1
+end
+# Character count: one less than the number of elements in the data vector
+function length(s::UTF32String)
+    return length(s.data) - 1
+end
+
+# Reverse the first `length(s)` elements of a copy of the data; the final
+# element of the copy stays where it is.
+function reverse(s::UTF32String)
+    buf = copy(s.data)
+    reverse!(buf, 1, length(s))
+    return UTF32String(buf)
+end
+
+# Byte size of the data vector minus one Char element
+function sizeof(s::UTF32String)
+    return sizeof(s.data) - sizeof(Char)
+end
+
+# Return true when every element of `str`, taken as a UInt32, passes
+# isvalid(Char, ...); stop at the first element that does not.
+function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
+    @inbounds for c in str
+        isvalid(Char, UInt32(c)) || return false
+    end
+    return true
+end
+# A bare vector of Char is checked with the UTF32String rules
+function isvalid(str::Vector{Char})
+    return isvalid(UTF32String, str)
+end
+
+# Apply `f` to every element of `s` except the final one and collect the
+# results in a new UTF32String. `f` must return a Char for each input.
+function map(f, s::UTF32String)
+    src = s.data
+    dst = similar(src)
+    dst[end] = 0
+
+    @inbounds for i = 1:(length(src)-1)
+        r = f(src[i])
+        isa(r, Char) || throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
+        dst[i] = (r::Char)
+    end
+    return UTF32String(dst)
+end
diff --git a/base/utfcheck.jl b/base/utfcheck.jl
@@ -0,0 +1,255 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+# Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings,
+# and also to return information necessary to convert to other encodings
+
+# True when `c` with its low 10 bits cleared equals 0xd800
+function is_surrogate_lead(c::Unsigned)
+    return (c & ~0x003ff) == 0xd800
+end
+# True when `c` with its low 10 bits cleared equals 0xdc00
+function is_surrogate_trail(c::Unsigned)
+    return (c & ~0x003ff) == 0xdc00
+end
+# True when `c` with its low 11 bits cleared equals 0xd800, i.e. for either
+# half of a surrogate pair
+function is_surrogate_codeunit(c::Unsigned)
+    return (c & ~0x007ff) == 0xd800
+end
+# True when the top two bits of `c` are `10`
+function is_valid_continuation(c)
+    return (c & 0xc0) == 0x80
+end
+
+# Options for check_string_* functions: bit values tested against the
+# `options` argument with `&`, so several may be OR-ed together

+const UTF_NO_LONG_NULL = 1      # don't accept 0xc0 0x80 for '\0'
+const UTF_NO_SURROGATES = 2     # don't accept surrogate pairs in UTF-8/UTF-32
+const UTF_ACCEPT_LONG = 4       # accept long encodings (other than long null in UTF-8)

+# Result bits that the check_string_* functions OR into the `flags` value
+# they return
+const UTF_LONG = 1              # Long encodings are present
+const UTF_LATIN1 = 2            # characters in range 0x80-0xFF present
+const UTF_UNICODE2 = 4          # characters in range 0x100-0x7ff present
+const UTF_UNICODE3 = 8          # characters in range 0x800-0xd7ff, 0xe000-0xffff
+const UTF_UNICODE4 = 16         # non-BMP characters present
+const UTF_SURROGATE = 32        # surrogate pairs present
+
+# Fold one UTF-8 continuation byte into the partially decoded character `ch`.
+# A byte that is not a continuation byte is reported through utf_errfunc with
+# position `pos`; otherwise the low six bits of `byt` are appended to `ch`.
+@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
+    if !is_valid_continuation(byt)
+        utf_errfunc(UTF_ERR_CONT, pos, byt)
+    end
+    return (ch << 6) | (byt & 0x3f)
+end
+
+# Validate a UTF-8 encoded vector of UInt8 and count the characters in it.
+#
+# dat      Vector{UInt8} holding the encoded text
+# options  bitwise OR of UTF_NO_LONG_NULL, UTF_NO_SURROGATES and
+#          UTF_ACCEPT_LONG (default 0)
+#
+# Returns (total characters, flags, 4-byte, 3-byte, 2-byte), where `flags` is
+# a bitwise OR of the UTF_LONG, UTF_LATIN1, UTF_UNICODE2, UTF_UNICODE3,
+# UTF_UNICODE4 and UTF_SURROGATE bits.
+# Malformed input is reported through utf_errfunc (the original notes for
+# this function document an ArgumentError).
+function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
+    local byt::UInt8, ch::UInt32, surr::UInt32
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
+    pos = 0
+    len = sizeof(dat)
+    @inbounds while pos < len
+        ch = dat[pos += 1]
+        totalchar += 1
+        if ch > 0x7f
+            # Check UTF-8 encoding
+            if ch < 0xc0
+                # 0x80-0xbf are continuation bytes and can never start a
+                # sequence; without this check they would be decoded as the
+                # lead byte of a 2-byte sequence
+                utf_errfunc(UTF_ERR_INVALID, pos, ch)
+            elseif ch < 0xe0
+                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
+                (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
+                ch = get_continuation(ch & 0x3f, dat[pos += 1], pos)
+                if ch > 0x7f
+                    num2byte += 1
+                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    flags |= UTF_LONG
+                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
+                    flags |= UTF_LONG
+                else
+                    utf_errfunc(UTF_ERR_LONG, pos, ch)
+                end
+            elseif ch < 0xf0
+                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
+                (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
+                ch = get_continuation(ch & 0x0f, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
+                # check for surrogate pairs, make sure correct
+                if is_surrogate_codeunit(ch)
+                    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
+                    # next character *must* be a trailing surrogate character
+                    (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
+                    byt = dat[pos += 1]
+                    (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
+                    surr = get_continuation(0x0000d, dat[pos += 1], pos)
+                    surr = get_continuation(surr, dat[pos += 1], pos)
+                    !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
+                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
+                    flags |= UTF_SURROGATE
+                    num4byte += 1
+                elseif ch > 0x07ff
+                    num3byte += 1
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    flags |= UTF_LONG
+                    # an over-long form of a character <= 0x7f is still a
+                    # 1-byte character, same as in the 4-byte branch below
+                    ch > 0x7f && (num2byte += 1)
+                else
+                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
+                end
+            elseif ch < 0xf5
+                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
+                (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
+                ch = get_continuation(ch & 0x07, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
+                if ch > 0x10ffff
+                    utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
+                elseif ch > 0xffff
+                    num4byte += 1
+                elseif is_surrogate_codeunit(ch)
+                    utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    # This is an overly long encode character
+                    flags |= UTF_LONG
+                    if ch > 0x7ff
+                        num3byte += 1
+                    elseif ch > 0x7f
+                        num2byte += 1
+                    end
+                else
+                    # pos-3 is the first byte of this 4-byte sequence
+                    utf_errfunc(UTF_ERR_LONG, pos-3, ch)
+                end
+            else
+                utf_errfunc(UTF_ERR_INVALID, pos, ch)
+            end
+        end
+    end
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
+end
+
+# Validate the first `len` code units of a UTF-16 encoded vector and count
+# the characters in them.
+#
+# dat  Vector{UInt16} holding the code units
+# len  number of code units of `dat` to examine
+#
+# Returns (total characters, flags, 4-byte, 3-byte, 2-byte).
+# Unpaired surrogates are reported through utf_errfunc.
+function check_string_utf16(dat::Vector{UInt16}, len::Int)
+    local ch::UInt32
+    flags::UInt = 0
+    totalchar = 0
+    num2byte = 0
+    num3byte = 0
+    num4byte = 0
+    pos = 0
+    @inbounds while pos < len
+        pos += 1
+        ch = dat[pos]
+        totalchar += 1
+        # plain ASCII needs no further bookkeeping
+        ch > 0x7f || continue
+        if ch < 0x100
+            num2byte += 1
+            flags |= UTF_LATIN1
+        elseif ch < 0x800
+            num2byte += 1
+            flags |= UTF_UNICODE2
+        elseif is_surrogate_codeunit(ch)
+            if is_surrogate_lead(ch)
+                pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
+                # next character *must* be a trailing surrogate character
+                pos += 1
+                ch = dat[pos]
+                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
+                num4byte += 1
+            else
+                utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
+            end
+        else
+            num3byte += 1
+        end
+    end
+    if num3byte != 0
+        flags |= UTF_UNICODE3
+    end
+    if num4byte != 0
+        flags |= UTF_UNICODE4
+    end
+    return totalchar, flags, num4byte, num3byte, num2byte
+end
+
+# Validate the first `len` elements of a UTF-32 encoded vector and count the
+# characters in them.
+#
+# dat      Vector{UInt32} holding the code points
+# len      number of elements of `dat` to examine
+# options  flags to determine error handling (default 0); only
+#          UTF_NO_SURROGATES is tested here
+#
+# Returns (total characters, flags, 4-byte, 3-byte, 2-byte).
+# Invalid values and unpaired surrogates are reported through utf_errfunc.
+function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
+    local ch::UInt32
+    flags::UInt = 0
+    totalchar = 0
+    num2byte = 0
+    num3byte = 0
+    num4byte = 0
+    pos = 0
+    @inbounds while pos < len
+        pos += 1
+        ch = dat[pos]
+        totalchar += 1
+        # plain ASCII needs no further bookkeeping
+        ch > 0x7f || continue
+        if ch < 0x100
+            num2byte += 1
+            flags |= UTF_LATIN1
+        elseif ch < 0x800
+            num2byte += 1
+            flags |= UTF_UNICODE2
+        elseif ch > 0xffff
+            (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch)
+            num4byte += 1
+        elseif is_surrogate_codeunit(ch)
+            if is_surrogate_lead(ch)
+                pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
+                # next character *must* be a trailing surrogate character
+                pos += 1
+                ch = dat[pos]
+                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
+                num4byte += 1
+                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch)
+                flags |= UTF_SURROGATE
+            else
+                utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
+            end
+        else
+            num3byte += 1
+        end
+    end
+    if num3byte != 0
+        flags |= UTF_UNICODE3
+    end
+    if num4byte != 0
+        flags |= UTF_UNICODE4
+    end
+    return totalchar, flags, num4byte, num3byte, num2byte
+end
+
+# Validate an arbitrary AbstractString and count its characters, walking it
+# with start/next.
+#
+# str      the string to examine
+# options  flags to determine error handling (default 0); only
+#          UTF_NO_SURROGATES is tested here
+#
+# Returns (total characters, flags, 4-byte, 3-byte, 2-byte).
+# Invalid values and unpaired surrogates are reported through utf_errfunc,
+# using the index of the offending character.
+function check_string_abs(str::AbstractString, options::Integer=0)
+    local ch::UInt32
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
+    pos = start(str)
+    len = endof(str)
+    # `pos` is the index of the next character to read and `len` the index of
+    # the last character, so the character at `len` must still be processed
+    @inbounds while pos <= len
+        idx = pos   # index of the character being examined, for error reports
+        ch, pos = next(str, pos)
+        totalchar += 1
+        if ch > 0x7f
+            if ch < 0x100
+                num2byte += 1
+                flags |= UTF_LATIN1
+            elseif ch < 0x800
+                num2byte += 1
+                flags |= UTF_UNICODE2
+            elseif ch > 0xffff
+                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, idx, ch)
+                num4byte += 1
+            elseif !is_surrogate_codeunit(ch)
+                num3byte += 1
+            elseif is_surrogate_lead(ch)
+                # `pos` already points past the lead; nothing left means no trail
+                pos > len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, ch)
+                # next character *must* be a trailing surrogate character
+                idx = pos
+                ch, pos = next(str, pos)
+                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, ch)
+                num4byte += 1
+                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, idx, ch)
+                flags |= UTF_SURROGATE
+            else
+                utf_errfunc(UTF_ERR_NOT_LEAD, idx, ch)
+            end
+        end
+    end
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
+end