Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support SAS XPORT #75

Merged
merged 2 commits into from
Apr 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions src/C_interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,19 @@ function readstat_get_modified_time(metadata::Ptr{Nothing})
end

"""
    readstat_get_file_format_version(metadata::Ptr{Nothing})

Return the file-format version that libreadstat reports for `metadata`, read as a `Cint`.
"""
function readstat_get_file_format_version(metadata::Ptr{Nothing})
    # Exactly one call: a second `return` after this one could never execute.
    return ccall((:readstat_get_file_format_version, libreadstat), Cint,
                 (Ptr{Nothing},), metadata)
end

"""
    readstat_get_row_count(metadata::Ptr{Nothing})

Return the row count that libreadstat reports for `metadata`, read as a `Cint`.

The result is signed so that a negative count survives the call.
NOTE(review): this package treats a negative `rows` as "unknown" for SAS XPORT;
confirm that value originates here.
"""
function readstat_get_row_count(metadata::Ptr{Nothing})
    # Exactly one call: a second `return` after this one could never execute.
    return ccall((:readstat_get_row_count, libreadstat), Cint,
                 (Ptr{Nothing},), metadata)
end

"""
    readstat_get_var_count(metadata::Ptr{Nothing})

Return the variable (column) count that libreadstat reports for `metadata`, read as a
`Cint`.
"""
function readstat_get_var_count(metadata::Ptr{Nothing})
    # Exactly one call: a second `return` after this one could never execute.
    return ccall((:readstat_get_var_count, libreadstat), Cint,
                 (Ptr{Nothing},), metadata)
end

"""
    readstat_value_is_missing(value::ReadStatValue, variable::Ptr{Nothing})

Return `true` when libreadstat reports `value` as missing for `variable`, else `false`.

`variable` is forwarded to the C function unchanged; callers in this package also pass
`C_NULL`.
"""
function readstat_value_is_missing(value::ReadStatValue, variable::Ptr{Nothing})
    flag = ccall((:readstat_value_is_missing, libreadstat), Cint,
                 (ReadStatValue, Ptr{Nothing}), value, variable)
    # C truthiness: any nonzero result means "missing". `Bool(flag)` would throw an
    # `InexactError` for a nonzero value other than 1.
    return flag != 0
end

function readstat_variable_get_index(variable::Ptr{Nothing})
Expand Down Expand Up @@ -56,19 +56,23 @@ function readstat_value_type(val::Value)
end

# `readstat_parse` has one method per supported file format, selected by the `Val` tag.
# Each method forwards to the matching `readstat_parse_*` entry point of libreadstat,
# passing `ds` through as the opaque user context (`Any`), and returns the C status
# code as a `Cint`. Every method contains exactly one call, so no statement is
# unreachable.

"""
    readstat_parse(filename::String, type::Val, parser::Ptr{Nothing}, ds::ReadStatDataFrame)

Parse `filename` with `parser`, handing `ds` to libreadstat as the user context, and
return the status code of the C parser as a `Cint`.

Methods exist for `Val{:dta}`, `Val{:sav}`, `Val{:por}`, `Val{:sas7bdat}` and
`Val{:xport}`.
"""
function readstat_parse(filename::String, type::Val{:dta}, parser::Ptr{Nothing}, ds::ReadStatDataFrame)
    return ccall((:readstat_parse_dta, libreadstat), Cint,
                 (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds)
end

function readstat_parse(filename::String, type::Val{:sav}, parser::Ptr{Nothing}, ds::ReadStatDataFrame)
    return ccall((:readstat_parse_sav, libreadstat), Cint,
                 (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds)
end

function readstat_parse(filename::String, type::Val{:por}, parser::Ptr{Nothing}, ds::ReadStatDataFrame)
    return ccall((:readstat_parse_por, libreadstat), Cint,
                 (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds)
end

function readstat_parse(filename::String, type::Val{:sas7bdat}, parser::Ptr{Nothing}, ds::ReadStatDataFrame)
    return ccall((:readstat_parse_sas7bdat, libreadstat), Cint,
                 (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds)
end

function readstat_parse(filename::String, type::Val{:xport}, parser::Ptr{Nothing}, ds::ReadStatDataFrame)
    return ccall((:readstat_parse_xport, libreadstat), Cint,
                 (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds)
end

function readstat_variable_get_missing_ranges_count(variable::Ptr{Nothing})
Expand Down
133 changes: 49 additions & 84 deletions src/ReadStat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ using DataValues: DataValueVector
import DataValues
using Dates

export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat
export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat, read_xport

##############################################################################
##
Expand Down Expand Up @@ -72,7 +72,7 @@ mutable struct ReadStatDataFrame
types_as_int::Vector{Cint}
hasmissings::Vector{Bool}

ReadStatDataFrame() =
ReadStatDataFrame() =
new(Any[], Symbol[], DataType[], String[], String[], Csize_t[], Cint[], Cint[],
String[], Dict{String, Dict{Any,String}}(), 0, 0, "", Dates.unix2datetime(0), 0, Cint[], Bool[])
end
Expand Down Expand Up @@ -114,9 +114,7 @@ function get_format(var::Ptr{Nothing})
ptr == C_NULL ? "" : unsafe_string(ptr)
end

function get_type(variable::Ptr{Nothing})
data_type = readstat_variable_get_type(variable)

function get_type(data_type::Cint)
if data_type == READSTAT_TYPE_STRING
return String
elseif data_type == READSTAT_TYPE_CHAR
Expand All @@ -132,18 +130,18 @@ function get_type(variable::Ptr{Nothing})
end
return Nothing
end
"""
    get_type(variable::Ptr{Nothing})

Fetch the type code of `variable` via `readstat_variable_get_type` and return whatever
the `get_type` method for that code yields.
"""
function get_type(variable::Ptr{Nothing})
    type_code = readstat_variable_get_type(variable)
    return get_type(type_code)
end

"""
    get_storagewidth(variable::Ptr{Nothing})

Return the result of `readstat_variable_get_storage_width` for `variable`.
"""
function get_storagewidth(variable::Ptr{Nothing})
    return readstat_variable_get_storage_width(variable)
end

"""
    get_measure(variable::Ptr{Nothing})

Return the result of `readstat_variable_get_measure` for `variable`.
"""
function get_measure(variable::Ptr{Nothing})
    return readstat_variable_get_measure(variable)
end

"""
    get_alignment(variable::Ptr{Nothing})

Return the alignment code that libreadstat stores for `variable`, read as a `Cint`.
"""
function get_alignment(variable::Ptr{Nothing})
    # Query the alignment accessor of the C library. Going through the measure accessor
    # would make this function return the same value as `get_measure`.
    return ccall((:readstat_variable_get_alignment, libreadstat), Cint,
                 (Ptr{Nothing},), variable)
end

function handle_variable!(var_index::Cint, variable::Ptr{Nothing},
function handle_variable!(var_index::Cint, variable::Ptr{Nothing},
val_label::Cstring, ds_ptr::Ptr{ReadStatDataFrame})
col = var_index + 1
ds = unsafe_pointer_to_objref(ds_ptr)::ReadStatDataFrame

missing_count = readstat_variable_get_missing_ranges_count(variable)

push!(ds.val_label_keys, (val_label == C_NULL ? "" : unsafe_string(val_label)))
Expand All @@ -154,11 +152,16 @@ function handle_variable!(var_index::Cint, variable::Ptr{Nothing},
push!(ds.types, jtype)
push!(ds.types_as_int, readstat_variable_get_type(variable))
push!(ds.hasmissings, missing_count > 0)
push!(ds.data, DataValueVector{jtype}(Vector{jtype}(undef, ds.rows), fill(false, ds.rows)))
# SAS XPORT sets ds.rows == -1
if ds.rows >= 0
push!(ds.data, DataValueVector{jtype}(Vector{jtype}(undef, ds.rows), fill(false, ds.rows)))
else
push!(ds.data, DataValueVector{jtype}(Vector{jtype}(undef, 0), fill(false, 0)))
end
push!(ds.storagewidths, get_storagewidth(variable))
push!(ds.measures, get_measure(variable))
push!(ds.alignments, get_alignment(variable))

return Cint(0)
end

Expand Down Expand Up @@ -191,100 +194,61 @@ function handle_value!(obs_index::Cint, variable::Ptr{Nothing},
readstat_value_is_missing(value, C_NULL)
end

if type_as_int==READSTAT_TYPE_DOUBLE
col_float64 = data[var_index]::DataValueVector{Float64}

if ismissing
DataValues.unsafe_setindex_isna!(col_float64, true, obs_index + 1)
else
readfield!(col_float64, obs_index + 1, value)
end
elseif type_as_int==READSTAT_TYPE_INT32
col_int32 = data[var_index]::DataValueVector{Int32}

if ismissing
DataValues.unsafe_setindex_isna!(col_int32, true, obs_index + 1)
else
readfield!(col_int32, obs_index + 1, value)
end
elseif type_as_int==READSTAT_TYPE_STRING
col_string = data[var_index]::DataValueVector{String}

if ismissing
DataValues.unsafe_setindex_isna!(col_string, true, obs_index + 1)
else
readfield!(col_string, obs_index + 1, value)
end
elseif type_as_int==READSTAT_TYPE_CHAR
col_int8 = data[var_index]::DataValueVector{Int8}

if ismissing
DataValues.unsafe_setindex_isna!(col_int8, true, obs_index + 1)
else
readfield!(col_int8, obs_index + 1, value)
end
elseif type_as_int==READSTAT_TYPE_INT16
col_int16 = data[var_index]::DataValueVector{Int16}

if ismissing
DataValues.unsafe_setindex_isna!(col_int16, true, obs_index + 1)
else
readfield!(col_int16, obs_index + 1, value)
end
elseif type_as_int==READSTAT_TYPE_FLOAT
col_float32 = data[var_index]::DataValueVector{Float32}

if ismissing
DataValues.unsafe_setindex_isna!(col_float32, true, obs_index + 1)
else
readfield!(col_float32, obs_index + 1, value)
end
else
col_untyped = data[var_index]
col = data[var_index]
@assert eltype(eltype(col)) == get_type(type_as_int)

if ismissing
DataValues.unsafe_setindex_isna!(col_untyped, true, obs_index + 1)
if ismissing
if obs_index < length(col)
DataValues.unsafe_setindex_isna!(col, true, obs_index + 1)
else
readfield!(col_untyped, obs_index + 1, value)
push!(col, DataValues.NA)
end
else
readfield!(col, obs_index + 1, value)
end

return Cint(0)
end

"""
    readfield!(dest::DataValueVector, row, val::ReadStatValue)

Extract the native value carried by `val` and store it at position `row` of `dest`.

If `row` lies inside `dest` the slot is overwritten; if `row == length(dest) + 1` the
value is appended, which lets columns grow when they were not preallocated. Any larger
`row` throws an `ArgumentError`.

For `String` columns a NULL pointer from libreadstat is stored as the empty string in
both the overwrite and the append path, so a slot is never left unassigned.
"""
function readfield!(dest::DataValueVector{String}, row, val::ReadStatValue)
    ptr = ccall((:readstat_string_value, libreadstat), Cstring, (ReadStatValue,), val)
    str = ptr ≠ C_NULL ? unsafe_string(ptr) : ""
    if row <= length(dest)
        @inbounds DataValues.unsafe_setindex_value!(dest, str, row)
    elseif row == length(dest) + 1
        push!(dest, str)
    else
        throw(ArgumentError("illegal row index: $row"))
    end
end

# One `readfield!` method per numeric element type, generated once at load time. Each
# method reads the value through the matching libreadstat accessor.
for (j_type, rs_name) in (
        (Int8, :readstat_int8_value),
        (Int16, :readstat_int16_value),
        (Int32, :readstat_int32_value),
        (Float32, :readstat_float_value),
        (Float64, :readstat_double_value))
    @eval function readfield!(dest::DataValueVector{$j_type}, row, val::ReadStatValue)
        _val = ccall(($(QuoteNode(rs_name)), libreadstat), $j_type, (ReadStatValue,), val)
        if row <= length(dest)
            @inbounds DataValues.unsafe_setindex_value!(dest, _val, row)
        elseif row == length(dest) + 1
            push!(dest, _val)
        else
            throw(ArgumentError("illegal row index: $row"))
        end
    end
end

"""
    handle_value_label!(val_labels::Cstring, value::Value, label::Cstring,
                        ds_ptr::Ptr{ReadStatDataFrame})

Record one value label in the `ReadStatDataFrame` behind `ds_ptr`.

The label set named by `val_labels` is looked up in `val_label_dict` (and created if
absent); `label` is then stored under the native form of `value`. A NULL `val_labels`
is ignored. Always returns `Cint(0)`.
"""
function handle_value_label!(val_labels::Cstring, value::Value, label::Cstring, ds_ptr::Ptr{ReadStatDataFrame})
    val_labels ≠ C_NULL || return Cint(0)
    # The assertion gives `ds` a concrete type, so the field accesses below are inferred.
    ds = unsafe_pointer_to_objref(ds_ptr)::ReadStatDataFrame
    # The `do` form builds the inner dictionary only when the label set is new.
    labels = get!(ds.val_label_dict, unsafe_string(val_labels)) do
        Dict{Any,String}()
    end
    labels[as_native(value)] = unsafe_string(label)

    return Cint(0)
end

Expand All @@ -311,7 +275,7 @@ function Parser()
ccall((:readstat_set_value_handler, libreadstat), Int, (Ptr{Nothing}, Ptr{Nothing}), parser, val_fxn)
ccall((:readstat_set_value_label_handler, libreadstat), Int, (Ptr{Nothing}, Ptr{Nothing}), parser, label_fxn)
return parser
end
end

function parse_data_file!(ds::ReadStatDataFrame, parser::Ptr{Nothing}, filename::AbstractString, filetype::Val)
retval = readstat_parse(filename, filetype, parser, ds)
Expand All @@ -323,5 +287,6 @@ read_dta(filename::AbstractString) = read_data_file(filename, Val(:dta))
"""
    read_sav(filename::AbstractString)

Read `filename` by calling `read_data_file` with the `Val(:sav)` format tag.
"""
function read_sav(filename::AbstractString)
    return read_data_file(filename, Val(:sav))
end

"""
    read_por(filename::AbstractString)

Read `filename` by calling `read_data_file` with the `Val(:por)` format tag.
"""
function read_por(filename::AbstractString)
    return read_data_file(filename, Val(:por))
end

"""
    read_sas7bdat(filename::AbstractString)

Read `filename` by calling `read_data_file` with the `Val(:sas7bdat)` format tag.
"""
function read_sas7bdat(filename::AbstractString)
    return read_data_file(filename, Val(:sas7bdat))
end

"""
    read_xport(filename::AbstractString)

Read `filename` by calling `read_data_file` with the `Val(:xport)` format tag.
"""
function read_xport(filename::AbstractString)
    return read_data_file(filename, Val(:xport))
end

end #module ReadStat
68 changes: 18 additions & 50 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,54 +2,22 @@ using ReadStat
using DataValues
using Test

# Each reader is exercised against its own `types.<ext>` fixture. The fixtures are
# expected to hold the same six columns, so one set of assertions covers every format
# and a new format only needs a new `(reader, ext)` pair.
@testset "ReadStat: $ext files" for (reader, ext) in
        ((read_dta, "dta"),
         (read_sav, "sav"),
         (read_sas7bdat, "sas7bdat"),
         (read_xport, "xpt"))

    datafile = joinpath(@__DIR__, "types.$ext")
    rsdf = reader(datafile)
    data = rsdf.data

    @test length(data) == 6
    @test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring]
    @test data[1] == DataValueArray{Float32}([3.14, 7., NA])
    @test data[2] == DataValueArray{Float64}([3.14, 7., NA])
    @test data[3] == DataValueArray{Int32}([2, 7, NA])
    @test data[4] == DataValueArray{Int16}([2, 7, NA])
    @test data[5] == DataValueArray{Int8}([2, 7., NA])
    @test data[6] == DataValueArray{String}(["2", "7", ""])
end
Binary file added test/types.xpt
Binary file not shown.