forked from JuliaLang/julia
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
fc0364b
commit 21bbaef
Showing
7 changed files
with
461 additions
and
440 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# This file is a part of Julia. License is MIT: http://julialang.org/license | ||
|
||
# Number of characters in `s`.  Every code unit that is not a trailing
# surrogate starts a character; the final element of `s.data` (the NULL
# terminator) is left out of the scan.
function length(s::UTF16String)
    units = s.data
    nchars = 0
    for idx = 1:(length(units) - 1)
        @inbounds if !is_surrogate_trail(units[idx])
            nchars += 1
        end
    end
    return nchars
end
|
||
# Index of the last character of `s`, or 0 when `s` holds no code units.
# When the final code unit is a surrogate, the character starts one unit
# earlier.
function endof(s::UTF16String)
    units = s.data
    tail = length(units) - 1
    if tail == 0
        return tail
    end
    if is_surrogate_codeunit(units[tail])
        return tail - 1
    end
    return tail
end
|
||
# Combine a surrogate pair into its supplementary code point.  The 0xd7f7
# bias folds the 0x10000 offset and the removal of the trail's 0xdc00 tag
# into a single subtraction on the lead.
function get_supplementary(lead::Unsigned, trail::Unsigned)
    high = UInt32(lead - 0xd7f7) << 10
    return high + trail
end
|
||
# Iteration step: decode the character starting at code unit `i` and return
# it together with the index of the following character.  Malformed surrogate
# sequences are reported through `utf_errfunc`.
function next(s::UTF16String, i::Int)
    lead = s.data[i]
    if !is_surrogate_codeunit(lead)
        return (Char(lead), i+1)
    end
    # a surrogate needs a partner before the terminating \0
    if i >= length(s.data) - 1
        utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(lead))
    end
    if !is_surrogate_lead(lead)
        utf_errfunc(UTF_ERR_NOT_LEAD, i, lead)
    end
    trail = s.data[i+1]
    if !is_surrogate_trail(trail)
        utf_errfunc(UTF_ERR_NOT_TRAIL, i, lead)
    end
    return (Char(get_supplementary(lead, trail)), i+2)
end
|
||
# Map index `i` of the reversed string back to an index into `s`.  If the
# mirrored position lands on a trailing surrogate, step back onto its lead.
function reverseind(s::UTF16String, i::Integer)
    idx = length(s.data) - i
    if is_surrogate_trail(s.data[idx])
        return idx - 1
    end
    return idx
end
|
||
# Index of the last code unit of `s`; the NULL terminator stored at the end
# of `s.data` is excluded.
function lastidx(s::UTF16String)
    return length(s.data) - 1
end
|
||
# Return a new UTF16String holding the characters of `s` in reverse order.
# Code units are copied back to front; whenever a lead surrogate is met, it is
# swapped with the trail written just before it so that each surrogate pair
# keeps its lead-then-trail order in the result.
function reverse(s::UTF16String)
    d = s.data
    out = similar(d)
    out[end] = 0 # NULL termination
    n = length(d)
    @inbounds for i = 1:n-1
        ch = d[n-i]
        # `i > 1` guard: a lead surrogate in the very last data position has no
        # previously written trail to swap with, and touching `out[i-1]` would
        # be an out-of-bounds access that `@inbounds` does not catch.
        if i > 1 && is_surrogate_lead(ch)
            out[i],out[i-1] = out[i-1],ch
        else
            out[i] = ch
        end
    end
    UTF16String(out)
end
|
||
# Size of `s` in bytes, leaving out the one UInt16 used as terminator.
function sizeof(s::UTF16String)
    return sizeof(s.data) - sizeof(UInt16)
end
|
||
# Report whether `data` is free of unpaired surrogates.  A lead surrogate
# immediately followed by a trail surrogate is consumed as a pair; any other
# surrogate code unit makes the data invalid.
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
    total = length(data) # this may include NULL termination; that's okay
    idx = 1
    @inbounds while idx < total
        unit = data[idx]
        if is_surrogate_lead(unit) && is_surrogate_trail(data[idx+1])
            idx += 2
        elseif is_surrogate_codeunit(unit)
            return false
        else
            idx += 1
        end
    end
    # the loop stops with at most one unexamined unit left
    if idx > total
        return true
    end
    return !is_surrogate_codeunit(data[idx])
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# This file is a part of Julia. License is MIT: http://julialang.org/license | ||
|
||
# UTF-32 basic functions | ||
# Iteration step: the element stored at `i` together with the following index.
function next(s::UTF32String, i::Int)
    return (s.data[i], i+1)
end

# Last index of `s`; the final element of `s.data` is not counted.
function endof(s::UTF32String)
    return length(s.data) - 1
end

# Number of characters in `s`; the final element of `s.data` is not counted.
function length(s::UTF32String)
    return length(s.data) - 1
end
|
||
# Reverse the first `length(s)` elements of a copy of the data; the last
# element stays in place.
function reverse(s::UTF32String)
    buf = copy(s.data)
    return UTF32String(reverse!(buf, 1, length(s)))
end
|
||
# Size of `s` in bytes, leaving out one Char-sized element at the end.
function sizeof(s::UTF32String)
    return sizeof(s.data) - sizeof(Char)
end
|
||
# True when every element of `str`, taken as a UInt32, is a valid Char.
function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
    for elem in str
        isvalid(Char, UInt32(elem)) || return false
    end
    return true
end

# A bare vector of Char is checked as UTF-32 data.
function isvalid(str::Vector{Char})
    return isvalid(UTF32String, str)
end
|
||
# Apply `f` to every character of `s` and build a new UTF32String from the
# results.  `f` must return a Char for each input; anything else raises an
# ArgumentError.
function map(f, s::UTF32String)
    src = s.data
    dest = similar(src)
    dest[end] = 0

    @inbounds for idx = 1:(length(src)-1)
        mapped = f(src[idx])
        isa(mapped, Char) || throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
        dest[idx] = (mapped::Char)
    end
    return UTF32String(dest)
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,255 @@ | ||
# This file is a part of Julia. License is MIT: http://julialang.org/license | ||
|
||
# Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings, | ||
# and also to return information necessary to convert to other encodings | ||
|
||
# Lead surrogates are 0xd800-0xdbff: everything above the low 10 bits must
# equal 0xd800.
function is_surrogate_lead(c::Unsigned)
    return (c & 0xfffffc00) == 0xd800
end

# Trail surrogates are 0xdc00-0xdfff.
function is_surrogate_trail(c::Unsigned)
    return (c & 0xfffffc00) == 0xdc00
end

# Any surrogate, lead or trail (0xd800-0xdfff): only the low 11 bits may vary.
function is_surrogate_codeunit(c::Unsigned)
    return (c & 0xfffff800) == 0xd800
end

# UTF-8 continuation bytes have the bit pattern 10xxxxxx.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
|
||
# Options for check_string_* functions | ||
|
||
# Bit flags that may be or-ed together and passed as `options`
const UTF_NO_LONG_NULL  = 1 << 0  # don't accept 0xc0 0x80 for '\0'
const UTF_NO_SURROGATES = 1 << 1  # don't accept surrogate pairs in UTF-8/UTF-32
const UTF_ACCEPT_LONG   = 1 << 2  # accept long encodings (other than long null in UTF-8)
|
||
# Bit flags describing what was found in the checked data; the check_string_*
# functions or these together into the `flags` value they return
const UTF_LONG      = 1 << 0  # Long encodings are present
const UTF_LATIN1    = 1 << 1  # characters in range 0x80-0xFF present
const UTF_UNICODE2  = 1 << 2  # characters in range 0x100-0x7ff present
const UTF_UNICODE3  = 1 << 3  # characters in range 0x800-0xd7ff, 0xe000-0xffff present
const UTF_UNICODE4  = 1 << 4  # non-BMP characters present
const UTF_SURROGATE = 1 << 5  # surrogate pairs present
|
||
# Fold one UTF-8 continuation byte into the partially decoded character `ch`.
# `byt` is checked first and reported through `utf_errfunc` (together with
# `pos`) when it is not a continuation byte.  The result is `ch` shifted left
# by six bits with the low six bits of `byt` or-ed in.
@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
    if !is_valid_continuation(byt)
        utf_errfunc(UTF_ERR_CONT, pos, byt)
    end
    return (ch << 6) | (byt & 0x3f)
end
|
||
#=
@doc """
@brief      Validates and calculates number of characters in a UTF-8 encoded vector of UInt8

@param[in]  dat     Vector of UInt8
@param[in]  options flags to determine error handling (default 0); any combination of
                    UTF_NO_LONG_NULL, UTF_NO_SURROGATES and UTF_ACCEPT_LONG

@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
            where the three counts are the number of characters that need that many
            bytes in UTF-8 (a surrogate pair is counted once, as 4-byte)
@throws     whatever utf_errfunc raises for malformed input
""" ->
=#
function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
    local byt::UInt8, ch::UInt32, surr::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    pos = 0
    len = sizeof(dat)
    @inbounds while pos < len
        ch = dat[pos += 1]
        totalchar += 1
        if ch > 0x7f
            # Check UTF-8 encoding
            if ch < 0xc0
                # 0x80-0xbf are continuation bytes and can never start a sequence;
                # without this check they would be decoded as 2-byte lead bytes
                utf_errfunc(UTF_ERR_INVALID, pos, ch)
            elseif ch < 0xe0
                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch = get_continuation(ch & 0x1f, dat[pos += 1], pos)
                if ch > 0x7f
                    num2byte += 1
                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    flags |= UTF_LONG
                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
                    # 0xc0 0x80 for '\0' is tolerated unless explicitly disabled
                    flags |= UTF_LONG
                else
                    utf_errfunc(UTF_ERR_LONG, pos, ch)
                end
            elseif ch < 0xf0
                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch = get_continuation(ch & 0x0f, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                # check for surrogate pairs, make sure correct
                if is_surrogate_codeunit(ch)
                    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
                    # next character *must* be a trailing surrogate character
                    (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
                    byt = dat[pos += 1]
                    # every 3-byte encoded surrogate starts with 0xed
                    (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
                    surr = get_continuation(0x0000d, dat[pos += 1], pos)
                    surr = get_continuation(surr, dat[pos += 1], pos)
                    !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
                    flags |= UTF_SURROGATE
                    # the pair stands for one non-BMP character
                    num4byte += 1
                elseif ch > 0x07ff
                    num3byte += 1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # over-long encoding: count it by the size it really needs,
                    # so an over-long ASCII value is not counted as 2-byte
                    flags |= UTF_LONG
                    ch > 0x7f && (num2byte += 1)
                else
                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
                end
            elseif ch < 0xf5
                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch = get_continuation(ch & 0x07, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                if ch > 0x10ffff
                    utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
                elseif ch > 0xffff
                    num4byte += 1
                elseif is_surrogate_codeunit(ch)
                    utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    if ch > 0x7ff
                        num3byte += 1
                    elseif ch > 0x7f
                        num2byte += 1
                    end
                else
                    # pos-3 is the lead byte of this 4-byte sequence
                    utf_errfunc(UTF_ERR_LONG, pos-3, ch)
                end
            else
                utf_errfunc(UTF_ERR_INVALID, pos, ch)
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end
|
||
#=
@doc """
@brief      Validates and calculates number of characters in a UTF-16 encoded vector of UInt16

@param[in]  dat     Vector{UInt16}
@param[in]  len     number of code units of dat to examine

@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
@throws     whatever utf_errfunc raises for malformed input
""" ->
=#
function check_string_utf16(dat::Vector{UInt16}, len::Int)
    local unit::UInt32
    flags::UInt = 0
    nchars = 0
    n2 = 0
    n3 = 0
    n4 = 0
    idx = 0
    @inbounds while idx < len
        idx += 1
        unit = dat[idx]
        nchars += 1
        # ASCII needs no further classification
        unit <= 0x7f && continue
        if unit < 0x100
            n2 += 1
            flags |= UTF_LATIN1
        elseif unit < 0x800
            n2 += 1
            flags |= UTF_UNICODE2
        elseif !is_surrogate_codeunit(unit)
            n3 += 1
        elseif is_surrogate_lead(unit)
            if idx == len
                utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, unit)
            end
            # the unit after a lead surrogate has to be a trail surrogate
            idx += 1
            unit = dat[idx]
            if !is_surrogate_trail(unit)
                utf_errfunc(UTF_ERR_NOT_TRAIL, idx, unit)
            end
            n4 += 1
        else
            utf_errfunc(UTF_ERR_NOT_LEAD, idx, unit)
        end
    end
    if n3 != 0
        flags |= UTF_UNICODE3
    end
    if n4 != 0
        flags |= UTF_UNICODE4
    end
    return nchars, flags, n4, n3, n2
end
|
||
#=
@doc """
@brief      Validates and calculates number of characters in a UTF-32 encoded vector of UInt32

@param[in]  dat     Vector{UInt32}
@param[in]  len     number of elements of dat to examine
@param[in]  options flags to determine error handling (default 0)

@return     (total characters, flags, 4-byte, 3-byte, 2-byte)
@throws     whatever utf_errfunc raises for malformed input
""" ->
=#
function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
    local cp::UInt32
    flags::UInt = 0
    nchars = 0
    n2 = 0
    n3 = 0
    n4 = 0
    idx = 0
    @inbounds while idx < len
        idx += 1
        cp = dat[idx]
        nchars += 1
        # ASCII needs no further classification
        cp <= 0x7f && continue
        if cp < 0x100
            n2 += 1
            flags |= UTF_LATIN1
        elseif cp < 0x800
            n2 += 1
            flags |= UTF_UNICODE2
        elseif cp > 0xffff
            if cp > 0x10ffff
                utf_errfunc(UTF_ERR_INVALID, idx, cp)
            end
            n4 += 1
        elseif !is_surrogate_codeunit(cp)
            n3 += 1
        elseif is_surrogate_lead(cp)
            if idx == len
                utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, cp)
            end
            # the element after a lead surrogate has to be a trail surrogate
            idx += 1
            cp = dat[idx]
            if !is_surrogate_trail(cp)
                utf_errfunc(UTF_ERR_NOT_TRAIL, idx, cp)
            end
            n4 += 1
            if (options & UTF_NO_SURROGATES) != 0
                utf_errfunc(UTF_ERR_SURROGATE, idx, cp)
            end
            flags |= UTF_SURROGATE
        else
            utf_errfunc(UTF_ERR_NOT_LEAD, idx, cp)
        end
    end
    if n3 != 0
        flags |= UTF_UNICODE3
    end
    if n4 != 0
        flags |= UTF_UNICODE4
    end
    return nchars, flags, n4, n3, n2
end
|
||
# Validates and calculates number of characters in an arbitrary AbstractString,
# walking it with start/next.
#
# `options` may contain UTF_NO_SURROGATES to reject surrogate pairs.
# Returns (total characters, flags, 4-byte, 3-byte, 2-byte), the same layout as
# the other check_string_* functions.  Problems are reported through utf_errfunc
# with the index of the offending character.
function check_string_abs(str::AbstractString, options::Integer=0)
    local ch::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    pos = start(str)
    len = endof(str)
    # `pos` is the index of the next character to read, so the character at
    # `len` itself still has to be processed: loop while pos <= len
    @inbounds while pos <= len
        cur = pos                   # index of the character being examined
        ch, pos = next(str, pos)
        totalchar += 1
        if ch > 0x7f
            if ch < 0x100
                num2byte += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                num2byte += 1
                flags |= UTF_UNICODE2
            elseif ch > 0xffff
                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, cur, ch)
                num4byte += 1
            elseif !is_surrogate_codeunit(ch)
                num3byte += 1
            elseif is_surrogate_lead(ch)
                # nothing left to read once the next index is past the last one
                pos > len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, cur, ch)
                # next character *must* be a trailing surrogate character
                cur = pos
                ch, pos = next(str, pos)
                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, cur, ch)
                num4byte += 1
                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, cur, ch)
                flags |= UTF_SURROGATE
            else
                utf_errfunc(UTF_ERR_NOT_LEAD, cur, ch)
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end
Oops, something went wrong.