diff --git a/src/C_interface.jl b/src/C_interface.jl index 4509f98..4880bcd 100644 --- a/src/C_interface.jl +++ b/src/C_interface.jl @@ -8,19 +8,19 @@ function readstat_get_modified_time(metadata::Ptr{Nothing}) end function readstat_get_file_format_version(metadata::Ptr{Nothing}) - return ccall((:readstat_get_file_format_version, libreadstat), UInt, (Ptr{Nothing},), metadata) + return ccall((:readstat_get_file_format_version, libreadstat), Cint, (Ptr{Nothing},), metadata) end function readstat_get_row_count(metadata::Ptr{Nothing}) - return ccall((:readstat_get_row_count, libreadstat), UInt, (Ptr{Nothing},), metadata) + return ccall((:readstat_get_row_count, libreadstat), Cint, (Ptr{Nothing},), metadata) end function readstat_get_var_count(metadata::Ptr{Nothing}) - return ccall((:readstat_get_var_count, libreadstat), UInt, (Ptr{Nothing},), metadata) + return ccall((:readstat_get_var_count, libreadstat), Cint, (Ptr{Nothing},), metadata) end function readstat_value_is_missing(value::ReadStatValue, variable::Ptr{Nothing}) - return ccall((:readstat_value_is_missing, libreadstat), Bool, (ReadStatValue,Ptr{Nothing}), value, variable) + return Bool(ccall((:readstat_value_is_missing, libreadstat), Cint, (ReadStatValue,Ptr{Nothing}), value, variable)) end function readstat_variable_get_index(variable::Ptr{Nothing}) @@ -56,19 +56,23 @@ function readstat_value_type(val::Value) end function readstat_parse(filename::String, type::Val{:dta}, parser::Ptr{Nothing}, ds::ReadStatDataFrame) - return ccall((:readstat_parse_dta, libreadstat), Int, (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds) + return ccall((:readstat_parse_dta, libreadstat), Cint, (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds) end function readstat_parse(filename::String, type::Val{:sav}, parser::Ptr{Nothing}, ds::ReadStatDataFrame) - return ccall((:readstat_parse_sav, libreadstat), Int, (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds) + return ccall((:readstat_parse_sav, libreadstat), Cint, (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds) end function readstat_parse(filename::String, type::Val{:por}, parser::Ptr{Nothing}, ds::ReadStatDataFrame) - return ccall((:readstat_parse_por, libreadstat), Int, (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds) + return ccall((:readstat_parse_por, libreadstat), Cint, (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds) end function readstat_parse(filename::String, type::Val{:sas7bdat}, parser::Ptr{Nothing}, ds::ReadStatDataFrame) - return ccall((:readstat_parse_sas7bdat, libreadstat), Int, (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds) + return ccall((:readstat_parse_sas7bdat, libreadstat), Cint, (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds) +end + +function readstat_parse(filename::String, type::Val{:xport}, parser::Ptr{Nothing}, ds::ReadStatDataFrame) + return ccall((:readstat_parse_xport, libreadstat), Cint, (Ptr{Nothing}, Cstring, Any), parser, string(filename), ds) end function readstat_variable_get_missing_ranges_count(variable::Ptr{Nothing}) diff --git a/src/ReadStat.jl b/src/ReadStat.jl index eb4565f..6979f3a 100644 --- a/src/ReadStat.jl +++ b/src/ReadStat.jl @@ -12,7 +12,7 @@ using DataValues: DataValueVector import DataValues using Dates -export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat +export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat, read_xport ############################################################################## ## @@ -72,7 +72,7 @@ mutable struct ReadStatDataFrame types_as_int::Vector{Cint} hasmissings::Vector{Bool} - ReadStatDataFrame() = + ReadStatDataFrame() = new(Any[], Symbol[], DataType[], String[], String[], Csize_t[], Cint[], Cint[], String[], Dict{String, Dict{Any,String}}(), 0, 0, "", Dates.unix2datetime(0), 0, Cint[], Bool[]) end @@ -114,9 +114,7 @@ function get_format(var::Ptr{Nothing}) ptr == C_NULL ? "" : unsafe_string(ptr) end -function get_type(variable::Ptr{Nothing}) - data_type = readstat_variable_get_type(variable) - +function get_type(data_type::Cint) if data_type == READSTAT_TYPE_STRING return String elseif data_type == READSTAT_TYPE_CHAR @@ -132,6 +130,7 @@ function get_type(variable::Ptr{Nothing}) end return Nothing end +get_type(variable::Ptr{Nothing}) = get_type(readstat_variable_get_type(variable)) get_storagewidth(variable::Ptr{Nothing}) = readstat_variable_get_storage_width(variable) @@ -139,11 +138,10 @@ get_measure(variable::Ptr{Nothing}) = readstat_variable_get_measure(variable) get_alignment(variable::Ptr{Nothing}) = readstat_variable_get_measure(variable) -function handle_variable!(var_index::Cint, variable::Ptr{Nothing}, +function handle_variable!(var_index::Cint, variable::Ptr{Nothing}, val_label::Cstring, ds_ptr::Ptr{ReadStatDataFrame}) col = var_index + 1 ds = unsafe_pointer_to_objref(ds_ptr)::ReadStatDataFrame - missing_count = readstat_variable_get_missing_ranges_count(variable) push!(ds.val_label_keys, (val_label == C_NULL ? "" : unsafe_string(val_label))) @@ -154,11 +152,16 @@ function handle_variable!(var_index::Cint, variable::Ptr{Nothing}, push!(ds.types, jtype) push!(ds.types_as_int, readstat_variable_get_type(variable)) push!(ds.hasmissings, missing_count > 0) - push!(ds.data, DataValueVector{jtype}(Vector{jtype}(undef, ds.rows), fill(false, ds.rows))) + # SAS XPORT sets ds.rows == -1 + if ds.rows >= 0 + push!(ds.data, DataValueVector{jtype}(Vector{jtype}(undef, ds.rows), fill(false, ds.rows))) + else + push!(ds.data, DataValueVector{jtype}(Vector{jtype}(undef, 0), fill(false, 0))) + end push!(ds.storagewidths, get_storagewidth(variable)) push!(ds.measures, get_measure(variable)) push!(ds.alignments, get_alignment(variable)) - + return Cint(0) end @@ -191,62 +194,17 @@ function handle_value!(obs_index::Cint, variable::Ptr{Nothing}, readstat_value_is_missing(value, C_NULL) end - if type_as_int==READSTAT_TYPE_DOUBLE - col_float64 = data[var_index]::DataValueVector{Float64} - - if ismissing - DataValues.unsafe_setindex_isna!(col_float64, true, obs_index + 1) - else - readfield!(col_float64, obs_index + 1, value) - end - elseif type_as_int==READSTAT_TYPE_INT32 - col_int32 = data[var_index]::DataValueVector{Int32} - - if ismissing - DataValues.unsafe_setindex_isna!(col_int32, true, obs_index + 1) - else - readfield!(col_int32, obs_index + 1, value) - end - elseif type_as_int==READSTAT_TYPE_STRING - col_string = data[var_index]::DataValueVector{String} - - if ismissing - DataValues.unsafe_setindex_isna!(col_string, true, obs_index + 1) - else - readfield!(col_string, obs_index + 1, value) - end - elseif type_as_int==READSTAT_TYPE_CHAR - col_int8 = data[var_index]::DataValueVector{Int8} - - if ismissing - DataValues.unsafe_setindex_isna!(col_int8, true, obs_index + 1) - else - readfield!(col_int8, obs_index + 1, value) - end - elseif type_as_int==READSTAT_TYPE_INT16 - col_int16 = data[var_index]::DataValueVector{Int16} - - if ismissing - DataValues.unsafe_setindex_isna!(col_int16, true, obs_index + 1) - else - readfield!(col_int16, obs_index + 1, value) - end - elseif type_as_int==READSTAT_TYPE_FLOAT - col_float32 = data[var_index]::DataValueVector{Float32} - - if ismissing - DataValues.unsafe_setindex_isna!(col_float32, true, obs_index + 1) - else - readfield!(col_float32, obs_index + 1, value) - end - else - col_untyped = data[var_index] + col = data[var_index] + @assert eltype(eltype(col)) == get_type(type_as_int) - if ismissing - DataValues.unsafe_setindex_isna!(col_untyped, true, obs_index + 1) + if ismissing + if obs_index < length(col) + DataValues.unsafe_setindex_isna!(col, true, obs_index + 1) else - readfield!(col_untyped, obs_index + 1, value) + push!(col, DataValues.NA) end + else + readfield!(col, obs_index + 1, value) end return Cint(0) @@ -254,29 +212,35 @@ end function readfield!(dest::DataValueVector{String}, row, val::ReadStatValue) ptr = ccall((:readstat_string_value, libreadstat), Cstring, (ReadStatValue,), val) - if ptr ≠ C_NULL - @inbounds DataValues.unsafe_setindex_value!(dest, unsafe_string(ptr), row) - end -end - -function readfield!(dest::DataValueVector{Int8}, row, val::ReadStatValue) - @inbounds DataValues.unsafe_setindex_value!(dest, ccall((:readstat_int8_value, libreadstat), Int8, (ReadStatValue,), val), row) -end - -function readfield!(dest::DataValueVector{Int16}, row, val::ReadStatValue) - @inbounds DataValues.unsafe_setindex_value!(dest, ccall((:readstat_int16_value, libreadstat), Int16, (ReadStatValue,), val), row) -end -function readfield!(dest::DataValueVector{Int32}, row, val::ReadStatValue) - @inbounds DataValues.unsafe_setindex_value!(dest, ccall((:readstat_int32_value, libreadstat), Int32, (ReadStatValue,), val), row) -end - -function readfield!(dest::DataValueVector{Float64}, row, val::ReadStatValue) - @inbounds DataValues.unsafe_setindex_value!(dest, ccall((:readstat_double_value, libreadstat), Float64, (ReadStatValue,), val), row) + if row <= length(dest) + if ptr ≠ C_NULL + @inbounds DataValues.unsafe_setindex_value!(dest, unsafe_string(ptr), row) + end + elseif row == length(dest) + 1 + _val = ptr ≠ C_NULL ? unsafe_string(ptr) : "" + DataValues.push!(dest, _val) + else + throw(ArgumentError("illegal row index: $row")) + end end -function readfield!(dest::DataValueVector{Float32}, row, val::ReadStatValue) - @inbounds DataValues.unsafe_setindex_value!(dest, ccall((:readstat_float_value, libreadstat), Float32, (ReadStatValue,), val), row) +for (j_type, rs_name) in ( + (Int8, :readstat_int8_value), + (Int16, :readstat_int16_value), + (Int32, :readstat_int32_value), + (Float32, :readstat_float_value), + (Float64, :readstat_double_value)) + @eval function readfield!(dest::DataValueVector{$j_type}, row, val::ReadStatValue) + _val = ccall(($(QuoteNode(rs_name)), libreadstat), $j_type, (ReadStatValue,), val) + if row <= length(dest) + @inbounds DataValues.unsafe_setindex_value!(dest, _val, row) + elseif row == length(dest) + 1 + DataValues.push!(dest, _val) + else + throw(ArgumentError("illegal row index: $row")) + end + end end function handle_value_label!(val_labels::Cstring, value::Value, label::Cstring, ds_ptr::Ptr{ReadStatDataFrame}) @@ -284,7 +248,7 @@ function handle_value_label!(val_labels::Cstring, value::Value, label::Cstring, ds = unsafe_pointer_to_objref(ds_ptr) dict = get!(ds.val_label_dict, unsafe_string(val_labels), Dict{Any,String}()) dict[as_native(value)] = unsafe_string(label) - + return Cint(0) end @@ -311,7 +275,7 @@ function Parser() ccall((:readstat_set_value_handler, libreadstat), Int, (Ptr{Nothing}, Ptr{Nothing}), parser, val_fxn) ccall((:readstat_set_value_label_handler, libreadstat), Int, (Ptr{Nothing}, Ptr{Nothing}), parser, label_fxn) return parser -end +end function parse_data_file!(ds::ReadStatDataFrame, parser::Ptr{Nothing}, filename::AbstractString, filetype::Val) retval = readstat_parse(filename, filetype, parser, ds) @@ -323,5 +287,6 @@ read_dta(filename::AbstractString) = read_data_file(filename, Val(:dta)) read_sav(filename::AbstractString) = read_data_file(filename, Val(:sav)) read_por(filename::AbstractString) = read_data_file(filename, Val(:por)) read_sas7bdat(filename::AbstractString) = read_data_file(filename, Val(:sas7bdat)) +read_xport(filename::AbstractString) = read_data_file(filename, Val(:xport)) end #module ReadStat diff --git a/test/runtests.jl b/test/runtests.jl index 97e6b81..dbc10b0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,54 +2,22 @@ using ReadStat using DataValues using Test -@testset "ReadStat" begin - -@testset "DTA files" begin - -dtafile = joinpath(dirname(@__FILE__), "types.dta") -rsdf = read_dta(dtafile) -data = rsdf.data - -@test length(data) == 6 -@test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring] -@test data[1] == DataValueArray{Float32}([3.14, 7., NA]) -@test data[2] == DataValueArray{Float64}([3.14, 7., NA]) -@test data[3] == DataValueArray{Int32}([2, 7, NA]) -@test data[4] == DataValueArray{Int16}([2, 7, NA]) -@test data[5] == DataValueArray{Int8}([2, 7., NA]) -@test data[6] == DataValueArray{String}(["2", "7", ""]) -end - -@testset "SAV files" begin - -dtafile = joinpath(dirname(@__FILE__), "types.sav") -rsdf = read_sav(dtafile) -data = rsdf.data - -@test length(data) == 6 -@test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring] -@test data[1] == DataValueArray{Float32}([3.14, 7., NA]) -@test data[2] == DataValueArray{Float64}([3.14, 7., NA]) -@test data[3] == DataValueArray{Int32}([2, 7, NA]) -@test data[4] == DataValueArray{Int16}([2, 7, NA]) -@test data[5] == DataValueArray{Int8}([2, 7., NA]) -@test data[6] == DataValueArray{String}(["2", "7", ""]) -end - -@testset "SAS7BDAT files" begin - -dtafile = joinpath(dirname(@__FILE__), "types.sas7bdat") -rsdf = read_sas7bdat(dtafile) -data = rsdf.data - -@test length(data) == 6 -@test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring] -@test data[1] == DataValueArray{Float32}([3.14, 7., NA]) -@test data[2] == DataValueArray{Float64}([3.14, 7., NA]) -@test data[3] == DataValueArray{Int32}([2, 7, NA]) -@test data[4] == DataValueArray{Int16}([2, 7, NA]) -@test data[5] == DataValueArray{Int8}([2, 7., NA]) -@test data[6] == DataValueArray{String}(["2", "7", ""]) -end - +@testset "ReadStat: $ext files" for (reader, ext) in + ((read_dta, "dta"), + (read_sav, "sav"), + (read_sas7bdat, "sas7bdat"), + (read_xport, "xpt")) + + dtafile = joinpath(dirname(@__FILE__), "types.$ext") + rsdf = reader(dtafile) + data = rsdf.data + + @test length(data) == 6 + @test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring] + @test data[1] == DataValueArray{Float32}([3.14, 7., NA]) + @test data[2] == DataValueArray{Float64}([3.14, 7., NA]) + @test data[3] == DataValueArray{Int32}([2, 7, NA]) + @test data[4] == DataValueArray{Int16}([2, 7, NA]) + @test data[5] == DataValueArray{Int8}([2, 7., NA]) + @test data[6] == DataValueArray{String}(["2", "7", ""]) end diff --git a/test/types.xpt b/test/types.xpt new file mode 100644 index 0000000..be4cb7c Binary files /dev/null and b/test/types.xpt differ