Skip to content

Commit

Permalink
Reorganize UTF handling files
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Jun 2, 2015
1 parent fc0364b commit 21bbaef
Show file tree
Hide file tree
Showing 7 changed files with 461 additions and 440 deletions.
7 changes: 6 additions & 1 deletion base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,15 @@ include("iterator.jl")
include("osutils.jl")

# strings & printing
include("utferror.jl")
include("utftype.jl")
include("utfcheck.jl")
include("char.jl")
include("ascii.jl")
include("utf8.jl")
include("utf.jl")
include("utf16.jl")
include("utf32.jl")
include("utfconvert.jl")
include("iobuffer.jl")
include("string.jl")
include("utf8proc.jl")
Expand Down
72 changes: 72 additions & 0 deletions base/utf16.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

# Number of characters in a UTF-16 string.
# Each character contributes exactly one code unit that is not a trailing
# surrogate, so counting those units (NULL terminator excluded) counts characters.
function length(s::UTF16String)
    units = s.data
    nunits = length(units) - 1   # s.data carries a trailing NULL code unit
    nchars = 0
    @inbounds for k = 1:nunits
        is_surrogate_trail(units[k]) || (nchars += 1)
    end
    return nchars
end

# Index of the last character of a UTF-16 string.
# Normally this is the final code unit before the NULL terminator; when that
# unit is a surrogate, the index of the unit just before it is returned instead.
function endof(s::UTF16String)
    units = s.data
    tail = length(units) - 1     # position of the last unit before the NULL
    if tail == 0
        return tail              # empty string
    end
    if is_surrogate_codeunit(units[tail])
        return tail - 1
    else
        return tail
    end
end

# Combine a leading and a trailing surrogate code unit into one code point.
# Subtracting 0xd7f7 from the lead before shifting folds the 0x10000 offset and
# the removal of the trail's 0xdc00 tag into a single addition:
#   (0xd800 - 0xd7f7) << 10 + 0xdc00 == 0x10000
function get_supplementary(lead::Unsigned, trail::Unsigned)
    high = UInt32(lead - 0xd7f7) << 10
    return high + trail
end

# Iteration step for UTF-16 strings: decode the character starting at code unit
# `i` and return it together with the index of the following character.
# A non-surrogate unit is a character by itself; otherwise a lead/trail pair is
# required and problems are reported through utf_errfunc (defined elsewhere).
function next(s::UTF16String, i::Int)
    units = s.data
    unit = units[i]
    if !is_surrogate_codeunit(unit)
        return (Char(unit), i+1)
    end
    # check length, account for terminating \0
    if i >= length(units) - 1
        utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(unit))
    end
    if !is_surrogate_lead(unit)
        utf_errfunc(UTF_ERR_NOT_LEAD, i, unit)
    end
    trail = units[i+1]
    if !is_surrogate_trail(trail)
        utf_errfunc(UTF_ERR_NOT_TRAIL, i, unit)
    end
    return (Char(get_supplementary(unit, trail)), i+2)
end

# Map index `i` to the mirrored code-unit index `length(s.data) - i`; when that
# unit is a trailing surrogate, step back one unit to the start of the pair.
function reverseind(s::UTF16String, i::Integer)
    units = s.data
    j = length(units) - i
    if is_surrogate_trail(units[j])
        return j - 1
    end
    return j
end

# Index of the last code unit of the string proper.
function lastidx(s::UTF16String)
    # s.data includes NULL terminator
    return length(s.data) - 1
end

# Build a new UTF16String holding the characters of `s` in reverse order.
# Code units are copied back to front; surrogate pairs must keep their
# lead-then-trail order, so when a lead is met the trail written on the previous
# step is moved one slot forward and the lead takes its place.
function reverse(s::UTF16String)
    src = s.data
    dst = similar(src)
    dst[end] = 0 # NULL termination
    n = length(src)
    @inbounds for i = 1:n-1
        unit = src[n-i]
        if is_surrogate_lead(unit)
            # dst[i-1] holds the trail of this pair: shift it, then store the lead
            dst[i] = dst[i-1]
            dst[i-1] = unit
        else
            dst[i] = unit
        end
    end
    return UTF16String(dst)
end

# Size in bytes of the string contents, leaving out one UInt16 for the
# NULL terminator kept in s.data.
function sizeof(s::UTF16String)
    return sizeof(s.data) - sizeof(UInt16)
end

# Check that `data` contains no unpaired surrogate code units.
# Returns true when every lead surrogate is immediately followed by a trail
# surrogate and no other surrogate code units occur.
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
    n = length(data) # this may include NULL termination; that's okay
    i = 1
    @inbounds while i < n
        unit = data[i]
        if is_surrogate_lead(unit) && is_surrogate_trail(data[i+1])
            i += 2              # well-formed pair, skip both units
        elseif is_surrogate_codeunit(unit)
            return false        # unpaired surrogate
        else
            i += 1
        end
    end
    # Either everything was consumed, or exactly one unit is left to examine
    if i > n
        return true
    end
    return !is_surrogate_codeunit(data[i])
end
33 changes: 33 additions & 0 deletions base/utf32.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

# UTF-32 basic functions
# Iteration step: element `i` of s.data together with the next index.
function next(s::UTF32String, i::Int)
    return (s.data[i], i+1)
end
# Last valid index: one less than the length of s.data.
function endof(s::UTF32String)
    return length(s.data) - 1
end
# Number of characters: one less than the length of s.data.
function length(s::UTF32String)
    return length(s.data) - 1
end

# New UTF32String with the characters reversed: positions 1:length(s) of a copy
# of s.data are reversed in place, so the final element stays where it is.
function reverse(s::UTF32String)
    buf = copy(s.data)
    return UTF32String(reverse!(buf, 1, length(s)))
end

# Size in bytes of s.data minus the size of one Char.
function sizeof(s::UTF32String)
    return sizeof(s.data) - sizeof(Char)
end

# True when every element of `str`, taken as a UInt32, passes isvalid(Char, ...).
function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
    for c in str
        isvalid(Char, UInt32(c)) || return false
    end
    return true
end
# A vector of Char is checked with the UTF32String validity rules.
function isvalid(str::Vector{Char})
    return isvalid(UTF32String, str)
end

# Apply `f` to every character of `s` and collect the results in a new
# UTF32String. `f` must return a Char for each input; anything else raises
# an ArgumentError.
function map(f, s::UTF32String)
    src = s.data
    dst = similar(src)
    dst[end] = 0    # final slot is set to zero; the loop below fills all earlier slots

    @inbounds for i = 1:(length(src)-1)
        mapped = f(src[i])
        isa(mapped, Char) || throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
        dst[i] = (mapped::Char)
    end
    return UTF32String(dst)
end
255 changes: 255 additions & 0 deletions base/utfcheck.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

# Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings,
# and also to return information necessary to convert to other encodings

# Leading surrogate test: with the low 10 bits masked off, `c` must equal 0xd800
# (0xd800-0xdbff for 16- and 32-bit code units).
function is_surrogate_lead(c::Unsigned)
    return (c & 0xfffffc00) == 0xd800
end
# Trailing surrogate test: with the low 10 bits masked off, `c` must equal 0xdc00
# (0xdc00-0xdfff for 16- and 32-bit code units).
function is_surrogate_trail(c::Unsigned)
    return (c & 0xfffffc00) == 0xdc00
end
# Any surrogate (lead or trail): with the low 11 bits masked off, `c` must equal
# 0xd800 (0xd800-0xdfff for 16- and 32-bit code units).
function is_surrogate_codeunit(c::Unsigned)
    return (c & 0xfffff800) == 0xd800
end
# UTF-8 continuation byte test: the top two bits must be `10` (0x80-0xbf).
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end

# Options for check_string_* functions
# Each option is a distinct bit; the checkers test them with `options & FLAG`.

const UTF_NO_LONG_NULL  = 1 << 0    # don't accept 0xc0 0x80 for '\0'
const UTF_NO_SURROGATES = 1 << 1    # don't accept surrogate pairs in UTF-8/UTF-32
const UTF_ACCEPT_LONG   = 1 << 2    # accept long encodings (other than long null in UTF-8)

# Result flag bits OR-ed together by the check_string_* functions
const UTF_LONG      = 1 << 0    # Long encodings are present
const UTF_LATIN1    = 1 << 1    # characters in range 0x80-0xFF present
const UTF_UNICODE2  = 1 << 2    # characters in range 0x100-0x7ff present
const UTF_UNICODE3  = 1 << 3    # characters in range 0x800-0xd7ff, 0xe000-0xffff
const UTF_UNICODE4  = 1 << 4    # non-BMP characters present
const UTF_SURROGATE = 1 << 5    # surrogate pairs present

# Fold one UTF-8 continuation byte into the character value accumulated so far.
# `byt` must have the form 10xxxxxx; if not, the problem is reported through
# utf_errfunc with position `pos`. The result is `ch` shifted left by six bits
# with the low six bits of `byt` appended.
@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
    if !is_valid_continuation(byt)
        utf_errfunc(UTF_ERR_CONT, pos, byt)
    end
    payload = byt & 0x3f
    return (ch << 6) | payload
end

#=
@doc """
@brief      Validates and calculates number of characters in a UTF-8 encoded vector of UInt8

@param[in]  dat     Vector of UInt8 holding the UTF-8 encoded data (all of it is checked)
@param[in]  options flags to determine error handling (default 0), any combination of
                    UTF_NO_LONG_NULL, UTF_NO_SURROGATES and UTF_ACCEPT_LONG

@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
            flags is the OR of the UTF_LONG, UTF_LATIN1, UTF_UNICODE2, UTF_UNICODE3,
            UTF_UNICODE4 and UTF_SURROGATE bits that apply; the three counts are the
            numbers of characters needing 4, 3 and 2 bytes in UTF-8.
            A surrogate pair encoded as two 3-byte sequences counts as one character
            and as one 4-byte character.
@throws     ArgumentError
""" ->
=#
function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
    local byt::UInt8, ch::UInt32, surr::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    pos = 0
    len = sizeof(dat)
    @inbounds while pos < len
        ch = dat[pos += 1]
        totalchar += 1
        if ch > 0x7f
            # Check UTF-8 encoding
            if ch < 0xc0
                # 0x80-0xbf are continuation bytes and can never start a sequence.
                # Without this check they would be decoded as 2-byte lead bytes
                # (e.g. 0xa0 0x80 would be accepted as a valid character).
                utf_errfunc(UTF_ERR_INVALID, pos, ch)
            elseif ch < 0xe0
                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch = get_continuation(ch & 0x3f, dat[pos += 1], pos)
                if ch > 0x7f
                    num2byte += 1
                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    flags |= UTF_LONG
                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
                    # 0xc0 0x80 ("long null") is accepted unless explicitly disabled
                    flags |= UTF_LONG
                else
                    utf_errfunc(UTF_ERR_LONG, pos, ch)
                end
            elseif ch < 0xf0
                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch = get_continuation(ch & 0x0f, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                # check for surrogate pairs, make sure correct
                if is_surrogate_codeunit(ch)
                    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
                    # next character *must* be a trailing surrogate character
                    (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
                    byt = dat[pos += 1]
                    # every surrogate is encoded with the lead byte 0xed
                    (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
                    surr = get_continuation(0x0000d, dat[pos += 1], pos)
                    surr = get_continuation(surr, dat[pos += 1], pos)
                    !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
                    flags |= UTF_SURROGATE
                    num4byte += 1
                elseif ch > 0x07ff
                    num3byte += 1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # This is an overly long encoded character; count it by the
                    # width it really needs (an ASCII value needs no extra bytes)
                    flags |= UTF_LONG
                    if ch > 0x7f
                        num2byte += 1
                    end
                else
                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
                end
            elseif ch < 0xf5
                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch = get_continuation(ch & 0x07, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                if ch > 0x10ffff
                    utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
                elseif ch > 0xffff
                    num4byte += 1
                elseif is_surrogate_codeunit(ch)
                    utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    if ch > 0x7ff
                        num3byte += 1
                    elseif ch > 0x7f
                        num2byte += 1
                    end
                else
                    # report the start of the 4-byte sequence, like the other
                    # errors in this branch
                    utf_errfunc(UTF_ERR_LONG, pos-3, ch)
                end
            else
                # 0xf5-0xff never appear in UTF-8
                utf_errfunc(UTF_ERR_INVALID, pos, ch)
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end

#=
@doc """
@brief      Validates and calculates number of characters in a UTF-16 encoded vector of UInt16

@param[in]  dat     Vector{UInt16}
@param[in]  len     number of code units of dat to examine (dat[1] through dat[len])

@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
            flags is the OR of the UTF_LATIN1, UTF_UNICODE2, UTF_UNICODE3 and
            UTF_UNICODE4 bits that apply; a surrogate pair counts as one character
            and as one 4-byte character.
@throws     ArgumentError
""" ->
=#
function check_string_utf16(dat::Vector{UInt16}, len::Int)
    local unit::UInt32
    flags::UInt = 0
    nchars = n2 = n3 = n4 = 0
    idx = 0
    @inbounds while idx < len
        idx += 1
        unit = dat[idx]
        nchars += 1
        # ASCII needs no further classification
        unit <= 0x7f && continue
        if unit < 0x100
            n2 += 1
            flags |= UTF_LATIN1
        elseif unit < 0x800
            n2 += 1
            flags |= UTF_UNICODE2
        elseif !is_surrogate_codeunit(unit)
            n3 += 1
        elseif !is_surrogate_lead(unit)
            # a trailing surrogate with no lead before it
            utf_errfunc(UTF_ERR_NOT_LEAD, idx, unit)
        else
            idx == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, unit)
            # next character *must* be a trailing surrogate character
            idx += 1
            unit = dat[idx]
            !is_surrogate_trail(unit) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, unit)
            n4 += 1
        end
    end
    n3 != 0 && (flags |= UTF_UNICODE3)
    n4 != 0 && (flags |= UTF_UNICODE4)
    return nchars, flags, n4, n3, n2
end

#=
@doc """
@brief      Validates and calculates number of characters in a UTF-32 encoded vector of UInt32

@param[in]  dat     Vector{UInt32}
@param[in]  len     number of elements of dat to examine (dat[1] through dat[len])
@param[in]  options flags to determine error handling (default 0);
                    UTF_NO_SURROGATES rejects surrogate pairs

@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
            flags is the OR of the UTF_LATIN1, UTF_UNICODE2, UTF_UNICODE3,
            UTF_UNICODE4 and UTF_SURROGATE bits that apply; a surrogate pair counts
            as one character and as one 4-byte character.
@throws     ArgumentError
""" ->
=#
function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
    local cp::UInt32
    flags::UInt = 0
    nchars = n2 = n3 = n4 = 0
    idx = 0
    @inbounds while idx < len
        idx += 1
        cp = dat[idx]
        nchars += 1
        # ASCII needs no further classification
        cp <= 0x7f && continue
        if cp < 0x100
            n2 += 1
            flags |= UTF_LATIN1
        elseif cp < 0x800
            n2 += 1
            flags |= UTF_UNICODE2
        elseif cp > 0xffff
            if cp > 0x10ffff
                utf_errfunc(UTF_ERR_INVALID, idx, cp)
            end
            n4 += 1
        elseif !is_surrogate_codeunit(cp)
            n3 += 1
        elseif !is_surrogate_lead(cp)
            # a trailing surrogate with no lead before it
            utf_errfunc(UTF_ERR_NOT_LEAD, idx, cp)
        else
            idx == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, cp)
            # next character *must* be a trailing surrogate character
            idx += 1
            cp = dat[idx]
            !is_surrogate_trail(cp) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, cp)
            n4 += 1
            if (options & UTF_NO_SURROGATES) != 0
                utf_errfunc(UTF_ERR_SURROGATE, idx, cp)
            end
            flags |= UTF_SURROGATE
        end
    end
    n3 != 0 && (flags |= UTF_UNICODE3)
    n4 != 0 && (flags |= UTF_UNICODE4)
    return nchars, flags, n4, n3, n2
end

# Validates and calculates number of characters in an arbitrary AbstractString,
# walking it with start/next.
#
# str      string to check (every character up to and including endof(str))
# options  flags to determine error handling (default 0);
#          UTF_NO_SURROGATES rejects surrogate pairs
#
# Returns (total characters, flags, 4-byte, 3-byte, 2-byte), where flags is the
# OR of the UTF_LATIN1, UTF_UNICODE2, UTF_UNICODE3, UTF_UNICODE4 and
# UTF_SURROGATE bits that apply. A surrogate pair counts as one character and as
# one 4-byte character. Problems are reported through utf_errfunc together with
# the index of the offending character.
function check_string_abs(str::AbstractString, options::Integer=0)
    local ch::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    pos = start(str)
    len = endof(str)
    # next() returns the index of the *following* character, so the string is
    # exhausted only once pos has moved past len (the index of the last character)
    @inbounds while pos <= len
        idx = pos                   # index of the character being examined
        ch, pos = next(str, pos)
        totalchar += 1
        if ch > 0x7f
            if ch < 0x100
                num2byte += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                num2byte += 1
                flags |= UTF_UNICODE2
            elseif ch > 0xffff
                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, idx, ch)
                num4byte += 1
            elseif !is_surrogate_codeunit(ch)
                num3byte += 1
            elseif is_surrogate_lead(ch)
                # the lead was the last character when nothing is left after it
                pos > len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, ch)
                # next character *must* be a trailing surrogate character
                idx = pos
                ch, pos = next(str, pos)
                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, ch)
                num4byte += 1
                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, idx, ch)
                flags |= UTF_SURROGATE
            else
                utf_errfunc(UTF_ERR_NOT_LEAD, idx, ch)
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end
Loading

0 comments on commit 21bbaef

Please sign in to comment.