Skip to content

Commit

Permalink
Merge pull request #23 from davidanthoff/datatable-dataframe
Browse files Browse the repository at this point in the history
Add support for DataTables and DataFrames
  • Loading branch information
evanmiller authored Mar 15, 2017
2 parents 1569d9a + 7a66d27 commit 429ddf8
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 31 deletions.
25 changes: 22 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@ ReadStat.jl: Read files from Stata, SPSS, and SAS

The ReadStat.jl Julia module uses the
[ReadStat](https://github.com/WizardMac/ReadStat) C library to parse binary and
transport files from Stata, SPSS and SAS. All functions return a
[DataFrame](https://github.com/JuliaStats/DataFrames.jl) by default, or a
[DataTable](https://github.com/JuliaData/DataTables.jl) when `DataTable` is
passed as the first argument.

Usage:

```julia
using ReadStat
using ReadStat, DataTables, DataFrames

read_dta("/path/to/something.dta")

Expand All @@ -22,4 +23,22 @@ read_por("/path/to/something.por")
read_sav("/path/to/something.sav")

read_sas7bdat("/path/to/something.sas7bdat")

read_dta(DataTable, "/path/to/something.dta")

read_por(DataTable, "/path/to/something.por")

read_sav(DataTable, "/path/to/something.sav")

read_sas7bdat(DataTable, "/path/to/something.sas7bdat")

read_dta(DataFrame, "/path/to/something.dta")

read_por(DataFrame, "/path/to/something.por")

read_sav(DataFrame, "/path/to/something.sav")

read_sas7bdat(DataFrame, "/path/to/something.sas7bdat")


```
2 changes: 2 additions & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
julia 0.5
DataArrays
NullableArrays
DataFrames
DataTables
BinDeps
@windows WinRPM
44 changes: 37 additions & 7 deletions src/ReadStat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ end
##
##############################################################################

using NullableArrays
import NullableArrays: NullableArray, NullableVector
import DataArrays: DataArray, DataVector
import DataFrames: DataFrame
import DataTables: DataTable
export read_dta, read_sav, read_por, read_sas7bdat

##############################################################################
Expand Down Expand Up @@ -96,9 +98,11 @@ type ReadStatDataFrame
header::Vector{Symbol}
types::Vector{DataType}
rows::Int
storage_type::DataType
end
# Convenience constructors for an empty ReadStatDataFrame: no columns, no
# header, no column types and zero rows.
#
# The zero-argument form forwards to the one-argument form. The type carries a
# `storage_type` field, so a four-value positional call no longer matches the
# default constructor. NullableArray is chosen because it is the column
# container that was allocated before `storage_type` was introduced.
ReadStatDataFrame() = ReadStatDataFrame(NullableArray)

# `destination_type` is stored as `storage_type`; it is later called as
# `storage_type(jtype, rows)` to allocate each column.
ReadStatDataFrame(destination_type::DataType) =
    ReadStatDataFrame(Any[], Symbol[], DataType[], 0, destination_type)

# Build the user-facing table from the parsed columns and their names.
DataFrame(ds::ReadStatDataFrame) = DataFrame(ds.data, ds.header)
DataTable(ds::ReadStatDataFrame) = DataTable(ds.data, ds.header)

##############################################################################
##
Expand Down Expand Up @@ -137,7 +141,7 @@ function handle_variable!(var_index::Cint, variable::Ptr{ReadStatVariable},
end
push!(ds.types, jtype)

push!(ds.data, NullableArray(jtype, ds.rows))
push!(ds.data, ds.storage_type(jtype, ds.rows))

return Cint(0)
end
Expand All @@ -155,6 +159,23 @@ function handle_value!(obs_index::Cint, var_index::Cint,
return Cint(0)
end

# Store a string cell in a DataVector{String} column.
#
# `val` is treated as the address of a C string: it is wrapped to Csize_t,
# reinterpreted as a Ptr{Int8} and copied into a Julia String. The string is
# written to `dest.data[row]` and `dest.na[row]` is set to false.
# `col` is accepted but not used here.
function readfield!(dest::DataVector{String}, row, col, val)
    addr = val % Csize_t
    str = unsafe_string(reinterpret(Ptr{Int8}, addr))
    @inbounds dest.data[row], dest.na[row] = str, false
end

# Store an integer cell in a DataVector{T} column, for any T <: Integer.
#
# `val` is assigned as-is to `dest.data[row]` and `dest.na[row]` is set to
# false. `col` is accepted but not used here.
# NOTE(review): presumably `val` always fits in T; confirm against the caller,
# since an out-of-range value would fail on assignment.
function readfield!{T <: Integer}(dest::DataVector{T}, row, col, val)
@inbounds dest.data[row], dest.na[row] = val, false
end

# Store a double-precision cell in a DataVector{Float64} column.
#
# The bits of `val` are reinterpreted as a Float64 (no numeric conversion),
# written to `dest.data[row]`, and `dest.na[row]` is set to false.
# `col` is accepted but not used here.
# NOTE(review): assumes `val` is a 64-bit value holding the raw bits; confirm
# against the caller.
function readfield!(dest::DataVector{Float64}, row, col, val)
    x = reinterpret(Float64, val)
    @inbounds dest.data[row], dest.na[row] = x, false
end

# Store a single-precision cell in a DataVector{Float32} column.
#
# `val % Int32` wraps `val` to a 32-bit integer; those bits are reinterpreted
# as a Float32, written to `dest.data[row]`, and `dest.na[row]` is set to
# false. `col` is accepted but not used here.
# NOTE(review): assumes `val` is an integer holding the raw float bits; confirm
# against the caller.
function readfield!(dest::DataVector{Float32}, row, col, val)
    bits = val % Int32
    @inbounds dest.data[row], dest.na[row] = reinterpret(Float32, bits), false
end

function readfield!(dest::NullableVector{String}, row, col, val)
val = unsafe_string(reinterpret(Ptr{Int8}, val % Csize_t))
@inbounds dest.values[row], dest.isnull[row] = val, false
Expand All @@ -176,15 +197,23 @@ function handle_value_label!(val_labels::Cstring, value::ReadStatValue, label::C
return Cint(0)
end

function read_data_file(filename::AbstractString, filetype::Type)
# Column container used when the requested result type is a DataFrame.
get_default_storage{T<:DataFrame}(::Type{T}) = DataArray

# Column container used when the requested result type is a DataTable.
get_default_storage{T<:DataTable}(::Type{T}) = NullableArray

# Parse `filename` and return its contents as a table of type `T`.
#
# `T` selects both the column storage (through `get_default_storage(T)`) and
# the container that is returned (through `T(ds)`). `filetype` is forwarded
# unchanged to `parse_data_file!`.
function read_data_file{T}(::Type{T}, filename::AbstractString, filetype::Type)
    # accumulator whose columns use the storage type that matches T
    ds = ReadStatDataFrame(get_default_storage(T))
    # initialize parser
    parser = Parser()
    # parse
    parse_data_file!(ds, parser, filename, filetype)
    # return the requested container instead of the ReadStatDataFrame
    return T(ds)
end

function Parser()
Expand Down Expand Up @@ -218,7 +247,8 @@ end
# Generate the public readers read_dta, read_sav, read_por and read_sas7bdat.
#
# For each format symbol `f` two methods are defined:
#   read_<f>(filename)     forwards to read_data_file(DataFrame, filename, Val{f})
#   read_<f>(T, filename)  forwards to read_data_file(T, filename, Val{f})
# so a DataFrame is returned unless the caller passes another container type.
for f in (:dta, :sav, :por, :sas7bdat)
    valtype = Val{f}
    @eval $(Symbol(:read_, f))(filename::AbstractString) = read_data_file(DataFrame, filename, $valtype)
    @eval $(Symbol(:read_, f)){T}(::Type{T}, filename::AbstractString) = read_data_file(T, filename, $valtype)
end


Expand Down
90 changes: 69 additions & 21 deletions test/read_dta.jl
Original file line number Diff line number Diff line change
@@ -1,27 +1,75 @@
using NullableArrays, Base.Test
using DataArrays, NullableArrays, DataTables, DataFrames, Base.Test

dtafile = joinpath(dirname(@__FILE__), "types.dta")

# Test default return container type
#
# Without a container argument read_dta returns a DataFrame, so every column
# is expected to be a DataVector and missing cells are checked with isna.

df = read_dta(dtafile)
@test typeof(df[:, :vfloat]) == DataVector{Float32}
@test typeof(df[:, :vdouble]) == DataVector{Float64}
@test typeof(df[:, :vlong]) == DataVector{Int32}
@test typeof(df[:, :vint]) == DataVector{Int16}
@test typeof(df[:, :vbyte]) == DataVector{Int8}
@test typeof(df[:, :vstring]) == DataVector{String}

@test df[2, :vfloat] == 7
@test df[2, :vdouble] == 7
@test df[2, :vlong] == 7
@test df[2, :vint] == 7
@test df[2, :vbyte] == 7
@test df[2, :vstring] == "7"

# Row 3 is missing in every numeric column; the string column holds "".
@test isna(df[3, :vfloat])
@test isna(df[3, :vdouble])
@test isna(df[3, :vlong])
@test isna(df[3, :vint])
@test isna(df[3, :vbyte])
@test df[3, :vstring] == ""

# Test explicit DataFrame return container type

df = read_dta(DataFrame, dtafile)

# Every column must come back as a DataVector of the matching element type.
for (col, T) in ((:vfloat, Float32), (:vdouble, Float64), (:vlong, Int32),
                 (:vint, Int16), (:vbyte, Int8), (:vstring, String))
    @test typeof(df[:, col]) == DataVector{T}
end

# Row 2 holds the value 7 in every column ("7" in the string column).
for col in (:vfloat, :vdouble, :vlong, :vint, :vbyte)
    @test df[2, col] == 7
end
@test df[2, :vstring] == "7"

# Row 3 is missing in every numeric column; the string column holds "".
for col in (:vfloat, :vdouble, :vlong, :vint, :vbyte)
    @test isna(df[3, col])
end
@test df[3, :vstring] == ""

# Test explicit DataTable return container type

dt = read_dta(DataTable, dtafile)

# Every column must come back as a NullableVector of the matching element type.
for (col, T) in ((:vfloat, Float32), (:vdouble, Float64), (:vlong, Int32),
                 (:vint, Int16), (:vbyte, Int8), (:vstring, String))
    @test typeof(dt[:, col]) == NullableVector{T}
end

# Row 2 holds the value 7 in every column ("7" in the string column).
for col in (:vfloat, :vdouble, :vlong, :vint, :vbyte)
    @test get(dt[2, col]) == 7
end
@test get(dt[2, :vstring]) == "7"

# Row 3 is null in every numeric column; the string column holds "".
for col in (:vfloat, :vdouble, :vlong, :vint, :vbyte)
    @test isnull(dt[3, col])
end
@test get(dt[3, :vstring]) == ""

0 comments on commit 429ddf8

Please sign in to comment.