diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index a885136..b34076a 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -706,83 +706,97 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable) = hcat!(dt[:, :], dt2) Base.hcat(dt::AbstractDataTable, x, y...) = hcat!(hcat(dt, x), y...) Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable...) = hcat!(hcat(dt1, dt2), dtn...) -# vcat only accepts DataTables. Finds union of columns, maintaining order -# of first dt. Missing data become null values. - -Base.vcat(dt::AbstractDataTable) = dt - -Base.vcat(dts::AbstractDataTable...) = vcat(AbstractDataTable[dts...]) - -function Base.vcat{T<:AbstractDataTable}(dts::Vector{T}) - isempty(dts) && return DataTable() - coltyps, colnams, similars = _colinfo(dts) - - res = DataTable() - Nrow = sum(nrow, dts) - for j in 1:length(colnams) - colnam = colnams[j] - col = similar(similars[j], coltyps[j], Nrow) - - i = 1 - for dt in dts - if haskey(dt, colnam) - copy!(col, i, dt[colnam]) - end - i += size(dt, 1) +@generated function promote_col_type(cols::AbstractVector...) + elty = Base.promote_eltype(cols...) + if elty <: Nullable + elty = eltype(elty) + end + if elty <: CategoricalValue + elty = elty.parameters[1] + end + if any(col -> eltype(col) <: Nullable, cols) + if any(col -> col <: Union{AbstractCategoricalArray, AbstractNullableCategoricalArray}, cols) + return :(NullableCategoricalVector{$elty}) + else + return :(NullableVector{$elty}) + end + else + if any(col -> col <: Union{AbstractCategoricalArray, AbstractNullableCategoricalArray}, cols) + return :(CategoricalVector{$elty}) + else + return :(Vector{$elty}) end - - res[colnam] = col end - res end -_isnullable{T}(::AbstractArray{T}) = T <: Nullable -const EMPTY_DATA = NullableArray(Void, 0) - -function _colinfo{T<:AbstractDataTable}(dts::Vector{T}) - dt1 = dts[1] - colindex = copy(index(dt1)) - coltyps = eltypes(dt1) - similars = collect(columns(dt1)) - nonnull_ct = Int[_isnullable(c) for c in columns(dt1)] - - for i in 2:length(dts) - dt = dts[i] - for j in 1:size(dt, 2) - col = dt[j] - cn, ct = _names(dt)[j], eltype(col) - if haskey(colindex, cn) - idx = colindex[cn] - - oldtyp = coltyps[idx] - if !(ct <: oldtyp) - coltyps[idx] = promote_type(oldtyp, ct) - # Needed on Julia 0.4 since e.g. - # promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}, - # which is not a usable type: fall back to Nullable{Any} - if VERSION < v"0.5.0-dev" && - coltyps[idx] <: Nullable && !isa(coltyps[idx].types[2], DataType) - coltyps[idx] = Nullable{Any} - end - end - nonnull_ct[idx] += !_isnullable(col) - else # new column - push!(colindex, cn) - push!(coltyps, ct) - push!(similars, col) - push!(nonnull_ct, !_isnullable(col)) +""" + vcat(dts::AbstractDataTable...) + +Vertically concatenate `AbstractDataTables` that have the same column names in +the same order. + +# Example +```jldoctest +julia> dt1 = DataTable(A=1:3, B=1:3); +julia> dt2 = DataTable(A=4:6, B=4:6); +julia> vcat(dt1, dt2) +6×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ +│ 4 │ 4 │ 4 │ +│ 5 │ 5 │ 5 │ +│ 6 │ 6 │ 6 │ +``` +""" +Base.vcat(dt::AbstractDataTable) = dt +function Base.vcat(dts::AbstractDataTable...) + isempty(dts) && return DataTable() + allheaders = map(names, dts) + if all(h -> length(h) == 0, allheaders) + return DataTable() + end + uniqueheaders = unique(allheaders) + if length(uniqueheaders) > 1 + unionunique = union(uniqueheaders...) + coldiff = setdiff(unionunique, intersect(uniqueheaders...)) + if !isempty(coldiff) + # if any datatables are a full superset of names, skip them + filter!(u -> Set(u) != Set(unionunique), uniqueheaders) + estrings = Vector{String}(length(uniqueheaders)) + for (i, u) in enumerate(uniqueheaders) + matching = find(h -> u == h, allheaders) + headerdiff = setdiff(coldiff, u) + cols = join(headerdiff, ", ", " and ") + args = join(matching, ", ", " and ") + estrings[i] = "column(s) $cols are missing from argument(s) $args" end + throw(ArgumentError(join(estrings, ", ", ", and "))) + else + estrings = Vector{String}(length(uniqueheaders)) + for (i, u) in enumerate(uniqueheaders) + indices = find(a -> a == u, allheaders) + estrings[i] = "column order of argument(s) $(join(indices, ", ", " and "))" + end + throw(ArgumentError(join(estrings, " != "))) end - end - - for j in 1:length(colindex) - if nonnull_ct[j] < length(dts) && !_isnullable(similars[j]) - similars[j] = EMPTY_DATA + else + header = uniqueheaders[1] + cols = Vector{Any}(length(header)) + for i in 1:length(cols) + data = [dt[i] for dt in dts] + lens = map(length, data) + cols[i] = promote_col_type(data...)(sum(lens)) + offset = 1 + for j in 1:length(data) + copy!(cols[i], offset, data[j]) + offset += lens[j] + end end + return DataTable(cols, header) end - colnams = _names(colindex) - - coltyps, colnams, similars end ############################################################################## diff --git a/src/groupeddatatable/grouping.jl b/src/groupeddatatable/grouping.jl index 711bcd3..2472976 100644 --- a/src/groupeddatatable/grouping.jl +++ b/src/groupeddatatable/grouping.jl @@ -193,7 +193,7 @@ combine(map(d -> mean(dropnull(d[:c])), gd)) """ function combine(ga::GroupApplied) gd, vals = ga.gd, ga.vals - valscat = vcat(vals) + valscat = vcat(vals...) idx = Vector{Int}(size(valscat, 1)) j = 0 @inbounds for (start, val) in zip(gd.starts, vals) diff --git a/test/cat.jl b/test/cat.jl index ab4e2ab..fd3f7e2 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -72,14 +72,14 @@ module TestCat dt[1:2, 1:2] = [3,2] dt[[true,false,false,true], 2:3] = [2,3] - vcat([]) - vcat(null_dt) - vcat(null_dt, null_dt) - vcat(null_dt, dt) - vcat(dt, null_dt) - vcat(dt, dt) - vcat(dt, dt, dt) - @test vcat(DataTable[]) == DataTable() + @test vcat(null_dt) == DataTable() + @test vcat(null_dt, null_dt) == DataTable() + @test_throws ArgumentError vcat(null_dt, dt) + @test_throws ArgumentError vcat(dt, null_dt) + @test eltypes(vcat(dt, dt)) == [Nullable{Float64}, Nullable{Float64}, Nullable{Int}] + @test size(vcat(dt, dt)) == (size(dt,1)*2, size(dt,2)) + @test eltypes(vcat(dt, dt, dt)) == [Nullable{Float64}, Nullable{Float64}, Nullable{Int}] + @test size(vcat(dt, dt, dt)) == (size(dt,1)*3, size(dt,2)) alt_dt = deepcopy(dt) vcat(dt, alt_dt) @@ -88,29 +88,13 @@ module TestCat dt[1] = zeros(Int, nrow(dt)) vcat(dt, alt_dt) - # Don't fail on non-matching names - names!(alt_dt, [:A, :B, :C]) - vcat(dt, alt_dt) - dtr = vcat(dt4, dt4) @test size(dtr, 1) == 8 @test names(dt4) == names(dtr) @test isequal(dtr, [dt4; dt4]) - dtr = vcat(dt2, dt3) - @test size(dtr) == (8,2) - @test names(dt2) == names(dtr) - @test isnull(dtr[8,:x2]) - - # Eltype promotion - # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} - if VERSION >= v"0.5.0-dev" - @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Float64}] - @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}] - else - @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Any}] - @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Any}] - end + @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Float64}] + @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}] # Minimal container type promotion dta = DataTable(a = CategoricalArray([1, 2, 2])) @@ -122,12 +106,7 @@ module TestCat @test isequal(dtab[:a], Nullable{Int}[1, 2, 2, 2, 3, 4]) @test isequal(dtac[:a], Nullable{Int}[1, 2, 2, 2, 3, 4]) @test isa(dtab[:a], NullableCategoricalVector{Int}) - # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} - if VERSION >= v"0.5.0-dev" - @test isa(dtac[:a], NullableCategoricalVector{Int}) - else - @test isa(dtac[:a], NullableCategoricalVector{Any}) - end + @test isa(dtac[:a], NullableCategoricalVector{Int}) # ^^ container may flip if container promotion happens in Base/DataArrays dc = vcat(dtd, dtc) @test isequal(vcat(dtc, dtd), dc) @@ -137,15 +116,120 @@ module TestCat @test isequal(vcat(dtd, dtc0, dtc), dc) @test eltypes(vcat(dtd, dtc0)) == eltypes(dc) - # Missing columns - rename!(dtd, :a, :b) - dtda = DataTable(b = NullableArray(Nullable{Int}[2, 3, 4, Nullable(), Nullable(), Nullable()]), - a = NullableCategoricalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2])) - @test isequal(vcat(dtd, dta), dtda) - - # Alignment - @test isequal(vcat(dtda, dtd, dta), vcat(dtda, dtda)) - # vcat should be able to concatenate different implementations of AbstractDataTable (PR #944) @test isequal(vcat(view(DataTable(A=1:3),2),DataTable(A=4:5)), DataTable(A=[2,4,5])) + + @testset "vcat >2 args" begin + @test vcat(DataTable(), DataTable(), DataTable()) == DataTable() + dt = DataTable(x = trues(1), y = falses(1)) + @test vcat(dt, dt, dt) == DataTable(x = trues(3), y = falses(3)) + end + + @testset "vcat mixed coltypes" begin + drf = CategoricalArrays.DefaultRefType + dt = vcat(DataTable([[1]], [:x]), DataTable([[1.0]], [:x])) + @test dt == DataTable([[1.0, 1.0]], [:x]) + @test typeof.(dt.columns) == [Vector{Float64}] + dt = vcat(DataTable([[1]], [:x]), DataTable([["1"]], [:x])) + @test dt == DataTable([[1, "1"]], [:x]) + @test typeof.(dt.columns) == [Vector{Any}] + dt = vcat(DataTable([NullableArray([1])], [:x]), DataTable([[1]], [:x])) + @test dt == DataTable([NullableArray([1, 1])], [:x]) + @test typeof.(dt.columns) == [NullableVector{Int}] + dt = vcat(DataTable([CategoricalArray([1])], [:x]), DataTable([[1]], [:x])) + @test dt == DataTable([CategoricalArray([1, 1])], [:x]) + @test typeof.(dt.columns) == [CategoricalVector{Int, drf}] + dt = vcat(DataTable([CategoricalArray([1])], [:x]), + DataTable([NullableArray([1])], [:x])) + @test dt == DataTable([NullableCategoricalArray([1, 1])], [:x]) + @test typeof.(dt.columns) == [NullableCategoricalVector{Int, drf}] + dt = vcat(DataTable([CategoricalArray([1])], [:x]), + DataTable([NullableCategoricalArray([1])], [:x])) + @test dt == DataTable([NullableCategoricalArray([1, 1])], [:x]) + @test typeof.(dt.columns) == [NullableCategoricalVector{Int, drf}] + dt = vcat(DataTable([NullableArray([1])], [:x]), + DataTable([NullableArray(["1"])], [:x])) + @test dt == DataTable([NullableArray([1, "1"])], [:x]) + @test typeof.(dt.columns) == [NullableVector{Any}] + dt = vcat(DataTable([CategoricalArray([1])], [:x]), + DataTable([CategoricalArray(["1"])], [:x])) + @test dt == DataTable([CategoricalArray([1, "1"])], [:x]) + @test typeof.(dt.columns) == [CategoricalVector{Any, drf}] + dt = vcat(DataTable([trues(1)], [:x]), DataTable([[false]], [:x])) + @test dt == DataTable([[true, false]], [:x]) + @test typeof.(dt.columns) == [Vector{Bool}] + end + + @testset "vcat errors" begin + err = @test_throws ArgumentError vcat(DataTable(), DataTable(), DataTable(x=[])) + @test err.value.msg == "column(s) x are missing from argument(s) 1 and 2" + err = @test_throws ArgumentError vcat(DataTable(), DataTable(), DataTable(x=[1])) + @test err.value.msg == "column(s) x are missing from argument(s) 1 and 2" + dt1 = DataTable(A = 1:3, B = 1:3) + dt2 = DataTable(A = 1:3) + # right missing 1 column + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) B are missing from argument(s) 2" + # left missing 1 column + err = @test_throws ArgumentError vcat(dt2, dt1) + @test err.value.msg == "column(s) B are missing from argument(s) 1" + # multiple missing 1 column + err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2, dt2) + @test err.value.msg == "column(s) B are missing from argument(s) 2, 3, 4, 5 and 6" + # argument missing >1 columns + dt1 = DataTable(A = 1:3, B = 1:3, C = 1:3, D = 1:3, E = 1:3) + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2" + # >1 arguments missing >1 columns + err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2) + @test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2, 3, 4 and 5" + # out of order + dt2 = dt1[reverse(names(dt1))] + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2" + # first group >1 arguments + err = @test_throws ArgumentError vcat(dt1, dt1, dt2) + @test err.value.msg == "column order of argument(s) 1 and 2 != column order of argument(s) 3" + # second group >1 arguments + err = @test_throws ArgumentError vcat(dt1, dt2, dt2) + @test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2 and 3" + # first and second groups >1 argument + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2) + @test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6" + # >2 groups out of order + srand(1) + dt3 = dt1[shuffle(names(dt1))] + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt3) + @test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6 != column order of argument(s) 7, 8, 9 and 10" + # missing columns throws error before out of order columns + dt1 = DataTable(A = 1, B = 1) + dt2 = DataTable(A = 1) + dt3 = DataTable(B = 1, A = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3) + @test err.value.msg == "column(s) B are missing from argument(s) 2" + # unique columns for both sides + dt1 = DataTable(A = 1, B = 1, C = 1, D = 1) + dt2 = DataTable(A = 1, C = 1, D = 1, E = 1, F = 1) + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4" + dt3 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + # dt4 is a superset of names found in all other datatables and won't be shown in error + dt4 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1, F = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3, dt4, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt4, dt4, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11" + end end diff --git a/test/grouping.jl b/test/grouping.jl index 4ce63a8..19a7c16 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -146,7 +146,7 @@ module TestGrouping groupby(dt, [:v1, :v2]) dt2 = by(e->1, DataTable(x=Int64[]), :x) - @test size(dt2) == (0,1) + @test size(dt2) == (0,2) @test isequal(sum(dt2[:x]), Nullable(0)) # Check that reordering levels does not confuse groupby