Skip to content

Commit

Permalink
Merge pull request #75 from andreasnoack/master
Browse files Browse the repository at this point in the history
Support SAS XPORT
  • Loading branch information
davidanthoff authored Apr 22, 2021
2 parents 28904d7 + 876cbc5 commit 63f43e3
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 142 deletions.
20 changes: 12 additions & 8 deletions src/C_interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,19 @@ function readstat_get_modified_time(metadata::Ptr{Nothing})
end

"""
    readstat_get_file_format_version(metadata::Ptr{Nothing})

Return the file format version recorded in the ReadStat `metadata` handle.

The underlying C function returns a C `int`, so the value is read as `Cint`.
"""
function readstat_get_file_format_version(metadata::Ptr{Nothing})
    return ccall((:readstat_get_file_format_version, libreadstat), Cint,
                 (Ptr{Nothing},), metadata)
end

"""
    readstat_get_row_count(metadata::Ptr{Nothing})

Return the number of rows recorded in the ReadStat `metadata` handle.

The value is read as a signed `Cint`: formats that do not store a row count
up front (SAS XPORT) report `-1`, which an unsigned return type would turn
into a huge positive number.
"""
function readstat_get_row_count(metadata::Ptr{Nothing})
    return ccall((:readstat_get_row_count, libreadstat), Cint,
                 (Ptr{Nothing},), metadata)
end

"""
    readstat_get_var_count(metadata::Ptr{Nothing})

Return the number of variables (columns) recorded in the ReadStat `metadata`
handle. The underlying C function returns a C `int`, hence `Cint`.
"""
function readstat_get_var_count(metadata::Ptr{Nothing})
    return ccall((:readstat_get_var_count, libreadstat), Cint,
                 (Ptr{Nothing},), metadata)
end

"""
    readstat_value_is_missing(value::ReadStatValue, variable::Ptr{Nothing})

Return `true` when ReadStat considers `value` missing for `variable`.

`variable` may be `C_NULL`. The C function returns a C `int`, so it is read
as `Cint` and compared with zero; any non-zero result counts as missing.
"""
function readstat_value_is_missing(value::ReadStatValue, variable::Ptr{Nothing})
    flag = ccall((:readstat_value_is_missing, libreadstat), Cint,
                 (ReadStatValue, Ptr{Nothing}), value, variable)
    return flag != 0
end

function readstat_variable_get_index(variable::Ptr{Nothing})
Expand Down Expand Up @@ -56,19 +56,23 @@ function readstat_value_type(val::Value)
end

# Parse a Stata `.dta` file with `parser`, filling `ds` through the registered
# handlers. `ds` is passed as `Any` so the C side receives the object pointer
# that the handlers later turn back into the `ReadStatDataFrame`.
# Returns the C status code; it is a C enum, so it is read as `Cint`.
function readstat_parse(filename::String, type::Val{:dta}, parser::Ptr{Nothing}, ds::ReadStatDataFrame)
    return ccall((:readstat_parse_dta, libreadstat), Cint,
                 (Ptr{Nothing}, Cstring, Any), parser, filename, ds)
end

# Parse an SPSS `.sav` file with `parser`, filling `ds` through the registered
# handlers. `ds` is passed as `Any` so the C side receives the object pointer
# that the handlers later turn back into the `ReadStatDataFrame`.
# Returns the C status code; it is a C enum, so it is read as `Cint`.
function readstat_parse(filename::String, type::Val{:sav}, parser::Ptr{Nothing}, ds::ReadStatDataFrame)
    return ccall((:readstat_parse_sav, libreadstat), Cint,
                 (Ptr{Nothing}, Cstring, Any), parser, filename, ds)
end

# Parse an SPSS portable `.por` file with `parser`, filling `ds` through the
# registered handlers. `ds` is passed as `Any` so the C side receives the object
# pointer that the handlers later turn back into the `ReadStatDataFrame`.
# Returns the C status code; it is a C enum, so it is read as `Cint`.
function readstat_parse(filename::String, type::Val{:por}, parser::Ptr{Nothing}, ds::ReadStatDataFrame)
    return ccall((:readstat_parse_por, libreadstat), Cint,
                 (Ptr{Nothing}, Cstring, Any), parser, filename, ds)
end

# Parse a SAS `.sas7bdat` file with `parser`, filling `ds` through the
# registered handlers. `ds` is passed as `Any` so the C side receives the object
# pointer that the handlers later turn back into the `ReadStatDataFrame`.
# Returns the C status code; it is a C enum, so it is read as `Cint`.
function readstat_parse(filename::String, type::Val{:sas7bdat}, parser::Ptr{Nothing}, ds::ReadStatDataFrame)
    return ccall((:readstat_parse_sas7bdat, libreadstat), Cint,
                 (Ptr{Nothing}, Cstring, Any), parser, filename, ds)
end

# Parse a SAS XPORT file by calling `readstat_parse_xport` with `parser`.
# `ds` is handed to C as `Any`, i.e. as the object pointer the handlers receive.
# The C status code is returned as `Cint`.
function readstat_parse(filename::String, type::Val{:xport}, parser::Ptr{Nothing}, ds::ReadStatDataFrame)
    path = string(filename)
    status = ccall((:readstat_parse_xport, libreadstat), Cint,
                   (Ptr{Nothing}, Cstring, Any), parser, path, ds)
    return status
end

function readstat_variable_get_missing_ranges_count(variable::Ptr{Nothing})
Expand Down
133 changes: 49 additions & 84 deletions src/ReadStat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ using DataValues: DataValueVector
import DataValues
using Dates

export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat
export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat, read_xport

##############################################################################
##
Expand Down Expand Up @@ -72,7 +72,7 @@ mutable struct ReadStatDataFrame
types_as_int::Vector{Cint}
hasmissings::Vector{Bool}

ReadStatDataFrame() =
ReadStatDataFrame() =
new(Any[], Symbol[], DataType[], String[], String[], Csize_t[], Cint[], Cint[],
String[], Dict{String, Dict{Any,String}}(), 0, 0, "", Dates.unix2datetime(0), 0, Cint[], Bool[])
end
Expand Down Expand Up @@ -114,9 +114,7 @@ function get_format(var::Ptr{Nothing})
ptr == C_NULL ? "" : unsafe_string(ptr)
end

function get_type(variable::Ptr{Nothing})
data_type = readstat_variable_get_type(variable)

function get_type(data_type::Cint)
if data_type == READSTAT_TYPE_STRING
return String
elseif data_type == READSTAT_TYPE_CHAR
Expand All @@ -132,18 +130,18 @@ function get_type(variable::Ptr{Nothing})
end
return Nothing
end
# Look up the ReadStat type code of `variable` and map it to the matching
# Julia type via the `get_type(::Cint)` method.
function get_type(variable::Ptr{Nothing})
    type_code = readstat_variable_get_type(variable)
    return get_type(type_code)
end

# Storage width of `variable`, as reported by `readstat_variable_get_storage_width`.
function get_storagewidth(variable::Ptr{Nothing})
    return readstat_variable_get_storage_width(variable)
end

# Measure code of `variable`, as reported by `readstat_variable_get_measure`.
function get_measure(variable::Ptr{Nothing})
    return readstat_variable_get_measure(variable)
end

# Alignment code of `variable`.
# The previous definition called `readstat_variable_get_measure`, so the
# alignment column silently duplicated the measure column. The C getter is
# called directly here; it returns a C enum, hence `Cint`.
function get_alignment(variable::Ptr{Nothing})
    return ccall((:readstat_variable_get_alignment, libreadstat), Cint,
                 (Ptr{Nothing},), variable)
end

function handle_variable!(var_index::Cint, variable::Ptr{Nothing},
function handle_variable!(var_index::Cint, variable::Ptr{Nothing},
val_label::Cstring, ds_ptr::Ptr{ReadStatDataFrame})
col = var_index + 1
ds = unsafe_pointer_to_objref(ds_ptr)::ReadStatDataFrame

missing_count = readstat_variable_get_missing_ranges_count(variable)

push!(ds.val_label_keys, (val_label == C_NULL ? "" : unsafe_string(val_label)))
Expand All @@ -154,11 +152,16 @@ function handle_variable!(var_index::Cint, variable::Ptr{Nothing},
push!(ds.types, jtype)
push!(ds.types_as_int, readstat_variable_get_type(variable))
push!(ds.hasmissings, missing_count > 0)
push!(ds.data, DataValueVector{jtype}(Vector{jtype}(undef, ds.rows), fill(false, ds.rows)))
# SAS XPORT sets ds.rows == -1
if ds.rows >= 0
push!(ds.data, DataValueVector{jtype}(Vector{jtype}(undef, ds.rows), fill(false, ds.rows)))
else
push!(ds.data, DataValueVector{jtype}(Vector{jtype}(undef, 0), fill(false, 0)))
end
push!(ds.storagewidths, get_storagewidth(variable))
push!(ds.measures, get_measure(variable))
push!(ds.alignments, get_alignment(variable))

return Cint(0)
end

Expand Down Expand Up @@ -191,100 +194,61 @@ function handle_value!(obs_index::Cint, variable::Ptr{Nothing},
readstat_value_is_missing(value, C_NULL)
end

if type_as_int==READSTAT_TYPE_DOUBLE
col_float64 = data[var_index]::DataValueVector{Float64}

if ismissing
DataValues.unsafe_setindex_isna!(col_float64, true, obs_index + 1)
else
readfield!(col_float64, obs_index + 1, value)
end
elseif type_as_int==READSTAT_TYPE_INT32
col_int32 = data[var_index]::DataValueVector{Int32}

if ismissing
DataValues.unsafe_setindex_isna!(col_int32, true, obs_index + 1)
else
readfield!(col_int32, obs_index + 1, value)
end
elseif type_as_int==READSTAT_TYPE_STRING
col_string = data[var_index]::DataValueVector{String}

if ismissing
DataValues.unsafe_setindex_isna!(col_string, true, obs_index + 1)
else
readfield!(col_string, obs_index + 1, value)
end
elseif type_as_int==READSTAT_TYPE_CHAR
col_int8 = data[var_index]::DataValueVector{Int8}

if ismissing
DataValues.unsafe_setindex_isna!(col_int8, true, obs_index + 1)
else
readfield!(col_int8, obs_index + 1, value)
end
elseif type_as_int==READSTAT_TYPE_INT16
col_int16 = data[var_index]::DataValueVector{Int16}

if ismissing
DataValues.unsafe_setindex_isna!(col_int16, true, obs_index + 1)
else
readfield!(col_int16, obs_index + 1, value)
end
elseif type_as_int==READSTAT_TYPE_FLOAT
col_float32 = data[var_index]::DataValueVector{Float32}

if ismissing
DataValues.unsafe_setindex_isna!(col_float32, true, obs_index + 1)
else
readfield!(col_float32, obs_index + 1, value)
end
else
col_untyped = data[var_index]
col = data[var_index]
@assert eltype(eltype(col)) == get_type(type_as_int)

if ismissing
DataValues.unsafe_setindex_isna!(col_untyped, true, obs_index + 1)
if ismissing
if obs_index < length(col)
DataValues.unsafe_setindex_isna!(col, true, obs_index + 1)
else
readfield!(col_untyped, obs_index + 1, value)
push!(col, DataValues.NA)
end
else
readfield!(col, obs_index + 1, value)
end

return Cint(0)
end

"""
    readfield!(dest::DataValueVector, row, val::ReadStatValue)

Extract the value carried by `val` and store it at position `row` of `dest`.

Two situations are supported:

- `1 <= row <= length(dest)`: the column was pre-sized, so the slot is
  overwritten in place.
- `row == length(dest) + 1`: the row count was not known in advance (SAS XPORT
  reports `-1` rows), so the column grows by one element.

Any other `row` throws an `ArgumentError`.

For string columns a `NULL` pointer from ReadStat leaves a pre-sized slot
untouched and appends `""` when the column is growing.
"""
function readfield!(dest::DataValueVector{String}, row, val::ReadStatValue)
    ptr = ccall((:readstat_string_value, libreadstat), Cstring, (ReadStatValue,), val)
    if 1 <= row <= length(dest)
        if ptr != C_NULL
            # The range check above makes the unchecked write safe.
            @inbounds DataValues.unsafe_setindex_value!(dest, unsafe_string(ptr), row)
        end
    elseif row == length(dest) + 1
        _val = ptr != C_NULL ? unsafe_string(ptr) : ""
        DataValues.push!(dest, _val)
    else
        throw(ArgumentError("illegal row index: $row"))
    end
    return nothing
end

# The numeric readers differ only in the element type and in the name of the C
# accessor, so one method per pair is generated instead of five hand-written
# copies.
for (j_type, rs_name) in (
        (Int8, :readstat_int8_value),
        (Int16, :readstat_int16_value),
        (Int32, :readstat_int32_value),
        (Float32, :readstat_float_value),
        (Float64, :readstat_double_value))
    @eval function readfield!(dest::DataValueVector{$j_type}, row, val::ReadStatValue)
        _val = ccall(($(QuoteNode(rs_name)), libreadstat), $j_type, (ReadStatValue,), val)
        if 1 <= row <= length(dest)
            # The range check above makes the unchecked write safe.
            @inbounds DataValues.unsafe_setindex_value!(dest, _val, row)
        elseif row == length(dest) + 1
            DataValues.push!(dest, _val)
        else
            throw(ArgumentError("illegal row index: $row"))
        end
        return nothing
    end
end

# Value-label callback: record that `value` carries `label` in the label set
# named by `val_labels`.
#
# `ds_ptr` is the object pointer of the `ReadStatDataFrame` being filled.
# A `NULL` label-set name is ignored. Always returns `Cint(0)`.
function handle_value_label!(val_labels::Cstring, value::Value, label::Cstring, ds_ptr::Ptr{ReadStatDataFrame})
    val_labels != C_NULL || return Cint(0)
    # The type assertion keeps the field accesses below type-stable.
    ds = unsafe_pointer_to_objref(ds_ptr)::ReadStatDataFrame
    dict = get!(ds.val_label_dict, unsafe_string(val_labels), Dict{Any,String}())
    dict[as_native(value)] = unsafe_string(label)

    return Cint(0)
end

Expand All @@ -311,7 +275,7 @@ function Parser()
ccall((:readstat_set_value_handler, libreadstat), Int, (Ptr{Nothing}, Ptr{Nothing}), parser, val_fxn)
ccall((:readstat_set_value_label_handler, libreadstat), Int, (Ptr{Nothing}, Ptr{Nothing}), parser, label_fxn)
return parser
end
end

function parse_data_file!(ds::ReadStatDataFrame, parser::Ptr{Nothing}, filename::AbstractString, filetype::Val)
retval = readstat_parse(filename, filetype, parser, ds)
Expand All @@ -323,5 +287,6 @@ read_dta(filename::AbstractString) = read_data_file(filename, Val(:dta))
# Format-specific entry points. Each forwards to `read_data_file` with the
# `Val` tag that selects the matching `readstat_parse` method.

# Read `filename` using the `:sav` parser.
function read_sav(filename::AbstractString)
    return read_data_file(filename, Val(:sav))
end

# Read `filename` using the `:por` parser.
function read_por(filename::AbstractString)
    return read_data_file(filename, Val(:por))
end

# Read `filename` using the `:sas7bdat` parser.
function read_sas7bdat(filename::AbstractString)
    return read_data_file(filename, Val(:sas7bdat))
end

# Read `filename` using the `:xport` parser.
function read_xport(filename::AbstractString)
    return read_data_file(filename, Val(:xport))
end

end #module ReadStat
68 changes: 18 additions & 50 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,54 +2,22 @@ using ReadStat
using DataValues
using Test

# The same expectations are checked against a `types.<ext>` fixture for each
# format, so one parametrised testset replaces the per-format copies.
@testset "ReadStat: $ext files" for (reader, ext) in
        ((read_dta, "dta"),
         (read_sav, "sav"),
         (read_sas7bdat, "sas7bdat"),
         (read_xport, "xpt"))

    dtafile = joinpath(@__DIR__, "types.$ext")
    rsdf = reader(dtafile)
    data = rsdf.data

    @test length(data) == 6
    @test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring]
    @test data[1] == DataValueArray{Float32}([3.14, 7., NA])
    @test data[2] == DataValueArray{Float64}([3.14, 7., NA])
    @test data[3] == DataValueArray{Int32}([2, 7, NA])
    @test data[4] == DataValueArray{Int16}([2, 7, NA])
    @test data[5] == DataValueArray{Int8}([2, 7., NA])
    @test data[6] == DataValueArray{String}(["2", "7", ""])
end
Binary file added test/types.xpt
Binary file not shown.

0 comments on commit 63f43e3

Please sign in to comment.