Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to DataFrames v0.11 (CategoricalArrays + Missings) #28

Merged
merged 4 commits into from
Nov 26, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
## RData v0.3.0 Release Notes

Updated to DataFrames v0.11, switched from [DataArrays](https://github.com/JuliaData/DataArrays.jl) to [Missings](https://github.com/JuliaData/Missings.jl) and [CategoricalArrays](https://github.com/JuliaData/CategoricalArrays.jl).

##### Changes
* updated to DataFrames v0.11 [#28]
* switched from `DataVector` to `Vector{Union{T, Missing}}` for NAs [#28]
* R factors converted into `CategoricalVector` (instead of `PooledDataArray`) [#28]

[#28]: https://github.com/JuliaStats/RData.jl/issues/28

## RData v0.2.0 Release Notes

Updated to Julia v0.6 (older versions not supported).
Expand Down
5 changes: 3 additions & 2 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
julia 0.6
DataFrames 0.9
DataArrays 0.4
DataFrames 0.11
Missings 0.2
CategoricalArrays 0.3
FileIO 0.1.2
CodecZlib 0.4
2 changes: 1 addition & 1 deletion src/RData.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ __precompile__()

module RData

using DataFrames, DataArrays, CodecZlib, FileIO
using DataFrames, CategoricalArrays, Missings, CodecZlib, FileIO
import DataFrames: identifier
import FileIO: load

Expand Down
116 changes: 79 additions & 37 deletions src/convert.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,83 +3,125 @@

function Base.convert(::Type{Hash}, pl::RPairList)
res = Hash()
for i in 1:length(pl.items)
setindex!(res, pl.items[i], pl.tags[i])
for i in eachindex(pl.items)
@inbounds setindex!(res, pl.items[i], pl.tags[i])
end
res
end

##############################################################################
##
## Conversion of intermediate R objects into DataArray and DataFrame objects
## Conversion of intermediate R objects into Vector{T} and DataFrame objects
##
##############################################################################

namask(rl::RLogicalVector) = BitArray(rl.data .== R_NA_INT32)
namask(ri::RIntegerVector) = BitArray(ri.data .== R_NA_INT32)
namask(rn::RNumericVector) = BitArray(map(isna_float64, reinterpret(UInt64, rn.data)))
isna(x::Int32) = x == R_NA_INT32
isna(x::Float64) = isna_float64(reinterpret(UInt64, x))
# if re or im is NA, the whole complex number is NA
# FIXME avoid temporary Vector{Bool}
namask(rc::RComplexVector) = BitArray([isna_float64(v.re) || isna_float64(v.im) for v in reinterpret(Complex{UInt64}, rc.data)])
namask(rv::RNullableVector) = rv.na
isna(x::Complex128) = isna(real(x)) || isna(imag(x))

DataArrays.data(rv::RVEC) = DataArray(rv.data, namask(rv))
# Convert the R vector `rv` into a Julia vector with elements of type T.
# When `force_missing` is true, or `rv` holds at least one NA, the result is
# Vector{Union{T,Missing}} with every NA replaced by `missing`;
# otherwise (no NAs and `force_missing` is false) it is a plain Vector{T}.
function jlvec(::Type{T}, rv::RVEC, force_missing::Bool=true) where T
    hasna = any(isna, rv.data)
    if !(force_missing || hasna)
        # nothing to mark as missing and Union{T,Missing} was not requested
        return convert(Vector{T}, rv.data)
    end
    out = convert(Vector{Union{T,Missing}}, rv.data)
    hasna || return out
    @inbounds for (idx, val) in enumerate(rv.data)
        if isna(val)
            out[idx] = missing
        end
    end
    return out
end

# Convert an R nullable vector, whose NAs are flagged by the explicit mask `rv.na`,
# into Vector{Union{T,Missing}} (when `force_missing` is true or any mask element
# is set) or into Vector{T} (otherwise).
function jlvec(::Type{T}, rv::RNullableVector{R}, force_missing::Bool=true) where {T, R}
    hasna = any(rv.na)
    (force_missing || hasna) || return convert(Vector{T}, rv.data)
    out = convert(Vector{Union{T,Missing}}, rv.data)
    if hasna
        # positions selected by the mask become `missing`
        @inbounds out[rv.na] = missing
    end
    return out
end

# Convert an R vector into a Julia vector, taking the element type from `rv.data`.
function jlvec(rv::RVEC, force_missing::Bool=true)
    return jlvec(eltype(rv.data), rv, force_missing)
end

function DataArrays.data(ri::RIntegerVector)
if !isfactor(ri) return DataArray(ri.data, namask(ri)) end
# convert factor into PooledDataArray
pool = getattr(ri, "levels", emptystrvec)
sz = length(pool)
# Convert an R logical vector (values are stored as Int32) into a vector of Bool:
# a nonzero value maps to `true`, zero to `false`.
# The result is Vector{Union{Bool,Missing}} with NAs turned into `missing` when
# `force_missing` is true or an NA is present, and Vector{Bool} otherwise.
function jlvec(rl::RLogicalVector, force_missing::Bool=true)
    if !force_missing && !any(isna, rl.data)
        return Bool[x != 0 for x in rl.data]
    end
    return Union{Bool,Missing}[isna(x) ? missing : x != 0 for x in rl.data]
end

# Kernel method: map each element of `v` to type R, substituting zero(R) for NA
# elements (as detected by `isna`) and `x % R` for all others.
# The caller is expected to guarantee that the non-NA values of `v` fit into R.
function na2zero(::Type{R}, v::Vector{Int32}) where R
    return [isna(x) ? zero(R) : x % R for x in v]
end

# convert to CategoricalVector{String[?]} if `ri` is a factor,
# or to Vector{Int32[?]} otherwise
function jlvec(ri::RIntegerVector, force_missing::Bool=true)
isfactor(ri) || return jlvec(eltype(ri.data), ri, force_missing)

rlevels = getattr(ri, "levels", emptystrvec)
sz = length(rlevels)
REFTYPE = sz <= typemax(UInt8) ? UInt8 :
sz <= typemax(UInt16) ? UInt16 :
sz <= typemax(UInt32) ? UInt32 :
UInt64
dd = ri.data
dd[namask(ri)] = 0
refs = convert(Vector{REFTYPE}, dd)
return PooledDataArray(DataArrays.RefArray(refs), pool)
# FIXME set ordered flag
refs = na2zero(REFTYPE, ri.data)
anyna = any(iszero, refs)
pool = CategoricalPool{String, REFTYPE}(rlevels)
if force_missing || anyna
return CategoricalArray{Union{String, Missing}, 1}(refs, pool)
else
return CategoricalArray{String, 1}(refs, pool)
end
end

# convert R logical vector (uses Int32 to store values) into DataVector{Bool}
DataArrays.data(rl::RLogicalVector) =
return DataArray(Bool[x != 0 for x in rl.data], namask(rl))

# Fallback conversion for R objects without a dedicated `sexp2julia` method:
# emits a warning naming the unsupported type and yields `nothing`.
function sexp2julia(rex::RSEXPREC)
    warn("Conversion of $(typeof(rex)) to Julia is not implemented")
    nothing
end

function sexp2julia(rv::RVEC)
# FIXME dimnames
# FIXME forceDataArrays option to always convert to DataArray
nas = namask(rv)
hasna = any(nas)
# TODO dimnames?
# FIXME add force_missing option to control whether always convert to Union{T, Missing}
jv = jlvec(rv, false)
if hasnames(rv)
# if data has no NA, convert to simple Vector
return DictoVec(hasna ? DataArray(rv.data, nas) : rv.data, names(rv))
return DictoVec(jv, names(rv))
else
hasdims = hasdim(rv)
if !hasdims && length(rv.data)==1
# scalar
# FIXME handle NAs
# if hasna
return rv.data[1]
return jv[1]
elseif !hasdims
# vectors
return hasna ? DataArray(rv.data, nas) : rv.data
return jv
else
# matrices and so on
dims = tuple(convert(Vector{Int64}, getattr(rv, "dim"))...)
return hasna ? DataArray(reshape(rv.data, dims), reshape(nas, dims)) :
reshape(rv.data, dims)
dims = tuple(convert(Vector{Int}, getattr(rv, "dim"))...)
return reshape(jv, dims)
end
end
end

function sexp2julia(rl::RList)
if isdataframe(rl)
# FIXME remove Any type assertion workaround
DataFrame(Any[data(col) for col in rl.data], map(identifier, names(rl)))
# FIXME add force_missing option to control whether always convert to Union{T, Missing}
DataFrame(Any[jlvec(col, false) for col in rl.data], identifier.(names(rl)))
elseif hasnames(rl)
DictoVec(Any[sexp2julia(item) for item in rl.data], names(rl))
else
Expand Down
17 changes: 9 additions & 8 deletions src/io/ASCIIIO.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ASCII RData format IO stream wrapper.
struct ASCIIIO{T<:IO} <: RDAIO
sub::T # underlying IO stream

(::Type{ASCIIIO})(io::T) where {T<:IO} = new{T}(io)
ASCIIIO(io::T) where {T<:IO} = new{T}(io)
end

readint32(io::ASCIIIO) = parse(Int32, readline(io.sub))
Expand All @@ -24,20 +24,21 @@ readintorNA(io::ASCIIIO, n::RVecLength) = Int32[readintorNA(io) for i in 1:n]
# str == R_NA_STRING ? R_NA_FLOAT64 : parse(Float64, str)
#end

function readfloatorNA(io::ASCIIIO, n::RVecLength)
res = Vector{Float64}(n)
res_uint = reinterpret(UInt64, res) # alias of res for setting NA
@inbounds for i in 1:n
# Fill `v` in place with one value per line read from `io.sub`:
# a line equal to R_NA_STRING is stored as the R_NA_FLOAT64 bit pattern,
# any other line is parsed as Float64.
function readfloatorNA!(io::ASCIIIO, v::AbstractVector{Float64})
v_uint = reinterpret(UInt64, v) # alias of v for setting NA
@inbounds for i in eachindex(v)
str = chomp(readline(io.sub))
if str != R_NA_STRING
res[i] = parse(Float64, str)
v[i] = parse(Float64, str)
else
res_uint[i] = R_NA_FLOAT64 # see JuliaStats/RData.jl#5
v_uint[i] = R_NA_FLOAT64 # see JuliaStats/RData.jl#5
end
end
res
v
end

readfloatorNA(io::ASCIIIO, n::RVecLength) = readfloatorNA!(io, Vector{Float64}(n))

readuint8(io::ASCIIIO, n::RVecLength) = UInt8[hex2bytes(chomp(readline(io.sub)))[1] for i in 1:n] # FIXME optimize for speed

function readnchars(io::ASCIIIO, n::Int32) # reads N bytes-sized string
Expand Down
3 changes: 2 additions & 1 deletion src/io/NativeIO.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ TODO write readers
"""
struct NativeIO{T<:IO} <: RDAIO
sub::T # underlying IO stream
(::Type{NativeIO})(io::T) where {T<:IO} = new{T}(io)

NativeIO(io::T) where {T<:IO} = new{T}(io)
end
14 changes: 10 additions & 4 deletions src/io/XDRIO.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ XDR (machine-independent binary) RData format IO stream wrapper.
struct XDRIO{T<:IO} <: RDAIO
sub::T # underlying IO stream
buf::Vector{UInt8} # buffer for strings
(::Type{XDRIO})(io::T) where {T <: IO} = new{T}(io, Vector{UInt8}(1024))

XDRIO(io::T) where {T <: IO} = new{T}(io, Vector{UInt8}(1024))
end

readint32(io::XDRIO) = ntoh(read(io.sub, Int32))
readuint32(io::XDRIO) = ntoh(read(io.sub, UInt32))
readfloat64(io::XDRIO) = reinterpret(Float64, ntoh(read(io.sub, Int64)))
readfloat64(io::XDRIO) = ntoh(read(io.sub, Float64))

readintorNA(io::XDRIO) = readint32(io)
function readintorNA(io::XDRIO, n::RVecLength)
Expand All @@ -21,8 +22,13 @@ end
# R's NA is silently converted to NaN when the value is loaded in the register(?)
#readfloatorNA(io::XDRIO) = readfloat64(io)
function readfloatorNA(io::XDRIO, n::RVecLength)
v = read(io.sub, UInt64, n)
reinterpret(Float64, map!(ntoh, v, v))
v = read(io.sub, Float64, n)
map!(ntoh, v, v)
end

# Fill `v` in place with Float64 values read from `io.sub` and return `v`.
# The raw bytes are read directly into a UInt8 reinterpretation of `v`, then every
# element is converted from network to host byte order with `ntoh`.
# Throws EOFError if the stream ends before `v` is completely filled: `readbytes!`
# reads *at most* the requested number of bytes, so without this check a truncated
# stream would silently leave the tail of `v` unread.
function readfloatorNA!(io::XDRIO, v::AbstractVector{Float64})
    bytes = reinterpret(UInt8, v) # byte-level view used as the read target
    nread = readbytes!(io.sub, bytes)
    nread == length(bytes) || throw(EOFError())
    map!(ntoh, v, v)
end

readuint8(io::XDRIO, n::RVecLength) = read(io.sub, UInt8, n)
Expand Down
5 changes: 3 additions & 2 deletions src/readers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ end
function readcomplex(ctx::RDAContext, fl::RDATag)
@assert sxtype(fl) == CPLXSXP
n = readlength(ctx.io)
RComplexVector(reinterpret(Complex128, readfloatorNA(ctx.io, 2n)),
readattrs(ctx, fl))
v = Vector{Complex128}(n)
readfloatorNA!(ctx.io, reinterpret(Float64, v))
RComplexVector(v, readattrs(ctx, fl))
end

function readstring(ctx::RDAContext, fl::RDATag)
Expand Down
2 changes: 1 addition & 1 deletion src/sxtypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ struct RVector{T, S} <: RVEC{T, S}
data::Vector{T}
attr::Hash # collection of R object attributes

(::Type{RVector{T,S}})(v::Vector{T}=T[], attr::Hash=Hash()) where {T,S} =
RVector{T,S}(v::Vector{T}=T[], attr::Hash=Hash()) where {T,S} =
new{T,S}(v, attr)
end

Expand Down
43 changes: 22 additions & 21 deletions test/RDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,53 +6,54 @@ module TestRDA
# check for Float64 NA
@testset "Detect R floating-point NAs" begin
@test !RData.isna_float64(reinterpret(UInt64, 1.0))
@test !RData.isna_float64(reinterpret(UInt64, NaN))
@test !RData.isna_float64(reinterpret(UInt64, Inf))
@test !RData.isna_float64(reinterpret(UInt64, -Inf))
@test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64))
@test !RData.isna(1.0)
@test !RData.isna(NaN)
@test !RData.isna(Inf)
@test !RData.isna(-Inf)
@test RData.isna_float64(RData.R_NA_FLOAT64)
# check that alternative NA is also recognized (#10)
@test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64 | ((Base.significand_mask(Float64) + 1) >> 1)))
end

testdir = dirname(@__FILE__)
@testset "Reading minimal RData" begin
df = DataFrame(num = [1.1, 2.2])
@test isequal(sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]), df)
@test isequal(load("$testdir/data/minimal.rda",convert=true)["df"], df)
@test isequal(load("$testdir/data/minimal_ascii.rda")["df"], df)
@test sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]) == df
@test load("$testdir/data/minimal.rda",convert=true)["df"] == df
@test load("$testdir/data/minimal_ascii.rda")["df"] == df
end

@testset "Conversion to Julia types" begin
df = DataFrame(num = [1.1, 2.2],
int = Int32[1, 2],
logi = [true, false],
chr = ["ab", "c"],
factor = pool(["ab", "c"]),
cplx = Complex128[1.1+0.5im, 1.0im])
factor = categorical(["ab", "c"], true),
cplx = [1.1+0.5im, 1.0im])
rdf = sexp2julia(load("$testdir/data/types.rda",convert=false)["df"])
@test eltypes(rdf) == eltypes(df)
@test isequal(rdf, df)
@test rdf == df
rdf_ascii = sexp2julia(load("$testdir/data/types_ascii.rda",convert=false)["df"])
@test eltypes(rdf_ascii) == eltypes(df)
@test isequal(rdf_ascii, df)
@test rdf_ascii == df
end

@testset "NAs conversion" begin
df = DataFrame(num = [1.1, 2.2],
int = Int32[1, 2],
logi = [true, false],
chr = ["ab", "c"],
factor = pool(["ab", "c"]),
cplx = Complex128[1.1+0.5im, 1.0im])
df = DataFrame(num = Union{Float64, Missing}[1.1, 2.2],
int = Union{Int32, Missing}[1, 2],
logi = Union{Bool, Missing}[true, false],
chr = Union{String, Missing}["ab", "c"],
factor = categorical(Union{String, Missing}["ab", "c"], true),
cplx = Union{Complex128, Missing}[1.1+0.5im, 1.0im])

df[2, :] = NA
df[2, :] = missing
append!(df, df[2, :])
df[3, :num] = NaN
df[:, :cplx] = @data [NA, Complex128(1,NaN), NaN]
df[:, :cplx] = [missing, Complex128(1,NaN), NaN]
@test isequal(sexp2julia(load("$testdir/data/NAs.rda",convert=false)["df"]), df)
# ASCII format saves NaN as NA
df[3, :num] = NA
df[:, :cplx] = @data [NA, NA, NA]
df[3, :num] = missing
df[:, :cplx] = missing
@test isequal(sexp2julia(load("$testdir/data/NAs_ascii.rda",convert=false)["df"]), df)
end

Expand Down