From dd9c3efc4939b75af39f1a7b9c6938602b71d665 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Mon, 3 Apr 2017 16:59:12 -0700 Subject: [PATCH 1/2] Update combine to use splat `...` style vcat, like in Base.vcat This change is helpful to support changing vcat to be more consistent with Base.vcat, where passing an array is not allowed. --- src/groupeddatatable/grouping.jl | 2 +- test/grouping.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/groupeddatatable/grouping.jl b/src/groupeddatatable/grouping.jl index 711bcd3..2472976 100644 --- a/src/groupeddatatable/grouping.jl +++ b/src/groupeddatatable/grouping.jl @@ -193,7 +193,7 @@ combine(map(d -> mean(dropnull(d[:c])), gd)) """ function combine(ga::GroupApplied) gd, vals = ga.gd, ga.vals - valscat = vcat(vals) + valscat = vcat(vals...) idx = Vector{Int}(size(valscat, 1)) j = 0 @inbounds for (start, val) in zip(gd.starts, vals) diff --git a/test/grouping.jl b/test/grouping.jl index 4ce63a8..19a7c16 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -146,7 +146,7 @@ module TestGrouping groupby(dt, [:v1, :v2]) dt2 = by(e->1, DataTable(x=Int64[]), :x) - @test size(dt2) == (0,1) + @test size(dt2) == (0,2) @test isequal(sum(dt2[:x]), Nullable(0)) # Check that reordering levels does not confuse groupby From 0478c37d8650897801982dad0bceab84591bdfcf Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Mon, 3 Apr 2017 18:49:51 -0700 Subject: [PATCH 2/2] Update vcat for consistency w/ Base.vcat and improve array promotion This PR removes vcat support for arrays of datatables and makes the Base.vcat style of vcat(args...) the only call option. Removes assumptions for joining datatables with missing, unique, and out of order columns. vcat'ing datatables with unmatched headers results in error messages that explain how the columns are inconsistent. Uses @nalimilan's @generated function to implement a new type of AbstractArray promotion rule that improves handling of NullableArrays and CategoricalArrays. Extends vcat testing. --- src/abstractdatatable/abstractdatatable.jl | 152 ++++++++++--------- test/cat.jl | 166 ++++++++++++++++----- 2 files changed, 208 insertions(+), 110 deletions(-) diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index a885136..b34076a 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -706,83 +706,97 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable) = hcat!(dt[:, :], dt2) Base.hcat(dt::AbstractDataTable, x, y...) = hcat!(hcat(dt, x), y...) Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable...) = hcat!(hcat(dt1, dt2), dtn...) -# vcat only accepts DataTables. Finds union of columns, maintaining order -# of first dt. Missing data become null values. - -Base.vcat(dt::AbstractDataTable) = dt - -Base.vcat(dts::AbstractDataTable...) = vcat(AbstractDataTable[dts...]) - -function Base.vcat{T<:AbstractDataTable}(dts::Vector{T}) - isempty(dts) && return DataTable() - coltyps, colnams, similars = _colinfo(dts) - - res = DataTable() - Nrow = sum(nrow, dts) - for j in 1:length(colnams) - colnam = colnams[j] - col = similar(similars[j], coltyps[j], Nrow) - - i = 1 - for dt in dts - if haskey(dt, colnam) - copy!(col, i, dt[colnam]) - end - i += size(dt, 1) +@generated function promote_col_type(cols::AbstractVector...) + elty = Base.promote_eltype(cols...) + if elty <: Nullable + elty = eltype(elty) + end + if elty <: CategoricalValue + elty = elty.parameters[1] + end + if any(col -> eltype(col) <: Nullable, cols) + if any(col -> col <: Union{AbstractCategoricalArray, AbstractNullableCategoricalArray}, cols) + return :(NullableCategoricalVector{$elty}) + else + return :(NullableVector{$elty}) + end + else + if any(col -> col <: Union{AbstractCategoricalArray, AbstractNullableCategoricalArray}, cols) + return :(CategoricalVector{$elty}) + else + return :(Vector{$elty}) end - - res[colnam] = col end - res end -_isnullable{T}(::AbstractArray{T}) = T <: Nullable -const EMPTY_DATA = NullableArray(Void, 0) - -function _colinfo{T<:AbstractDataTable}(dts::Vector{T}) - dt1 = dts[1] - colindex = copy(index(dt1)) - coltyps = eltypes(dt1) - similars = collect(columns(dt1)) - nonnull_ct = Int[_isnullable(c) for c in columns(dt1)] - - for i in 2:length(dts) - dt = dts[i] - for j in 1:size(dt, 2) - col = dt[j] - cn, ct = _names(dt)[j], eltype(col) - if haskey(colindex, cn) - idx = colindex[cn] - - oldtyp = coltyps[idx] - if !(ct <: oldtyp) - coltyps[idx] = promote_type(oldtyp, ct) - # Needed on Julia 0.4 since e.g. - # promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}, - # which is not a usable type: fall back to Nullable{Any} - if VERSION < v"0.5.0-dev" && - coltyps[idx] <: Nullable && !isa(coltyps[idx].types[2], DataType) - coltyps[idx] = Nullable{Any} - end - end - nonnull_ct[idx] += !_isnullable(col) - else # new column - push!(colindex, cn) - push!(coltyps, ct) - push!(similars, col) - push!(nonnull_ct, !_isnullable(col)) +""" + vcat(dts::AbstractDataTable...) + +Vertically concatenate `AbstractDataTables` that have the same column names in +the same order. + +# Example +```jldoctest +julia> dt1 = DataTable(A=1:3, B=1:3); +julia> dt2 = DataTable(A=4:6, B=4:6); +julia> vcat(dt1, dt2) +6×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ +│ 4 │ 4 │ 4 │ +│ 5 │ 5 │ 5 │ +│ 6 │ 6 │ 6 │ +``` +""" +Base.vcat(dt::AbstractDataTable) = dt +function Base.vcat(dts::AbstractDataTable...) + isempty(dts) && return DataTable() + allheaders = map(names, dts) + if all(h -> length(h) == 0, allheaders) + return DataTable() + end + uniqueheaders = unique(allheaders) + if length(uniqueheaders) > 1 + unionunique = union(uniqueheaders...) + coldiff = setdiff(unionunique, intersect(uniqueheaders...)) + if !isempty(coldiff) + # if any datatables are a full superset of names, skip them + filter!(u -> Set(u) != Set(unionunique), uniqueheaders) + estrings = Vector{String}(length(uniqueheaders)) + for (i, u) in enumerate(uniqueheaders) + matching = find(h -> u == h, allheaders) + headerdiff = setdiff(coldiff, u) + cols = join(headerdiff, ", ", " and ") + args = join(matching, ", ", " and ") + estrings[i] = "column(s) $cols are missing from argument(s) $args" end + throw(ArgumentError(join(estrings, ", ", ", and "))) + else + estrings = Vector{String}(length(uniqueheaders)) + for (i, u) in enumerate(uniqueheaders) + indices = find(a -> a == u, allheaders) + estrings[i] = "column order of argument(s) $(join(indices, ", ", " and "))" + end + throw(ArgumentError(join(estrings, " != "))) end - end - - for j in 1:length(colindex) - if nonnull_ct[j] < length(dts) && !_isnullable(similars[j]) - similars[j] = EMPTY_DATA + else + header = uniqueheaders[1] + cols = Vector{Any}(length(header)) + for i in 1:length(cols) + data = [dt[i] for dt in dts] + lens = map(length, data) + cols[i] = promote_col_type(data...)(sum(lens)) + offset = 1 + for j in 1:length(data) + copy!(cols[i], offset, data[j]) + offset += lens[j] + end end + return DataTable(cols, header) end - colnams = _names(colindex) - - coltyps, colnams, similars end ############################################################################## diff --git a/test/cat.jl b/test/cat.jl index ab4e2ab..fd3f7e2 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -72,14 +72,14 @@ module TestCat dt[1:2, 1:2] = [3,2] dt[[true,false,false,true], 2:3] = [2,3] - vcat([]) - vcat(null_dt) - vcat(null_dt, null_dt) - vcat(null_dt, dt) - vcat(dt, null_dt) - vcat(dt, dt) - vcat(dt, dt, dt) - @test vcat(DataTable[]) == DataTable() + @test vcat(null_dt) == DataTable() + @test vcat(null_dt, null_dt) == DataTable() + @test_throws ArgumentError vcat(null_dt, dt) + @test_throws ArgumentError vcat(dt, null_dt) + @test eltypes(vcat(dt, dt)) == [Nullable{Float64}, Nullable{Float64}, Nullable{Int}] + @test size(vcat(dt, dt)) == (size(dt,1)*2, size(dt,2)) + @test eltypes(vcat(dt, dt, dt)) == [Nullable{Float64}, Nullable{Float64}, Nullable{Int}] + @test size(vcat(dt, dt, dt)) == (size(dt,1)*3, size(dt,2)) alt_dt = deepcopy(dt) vcat(dt, alt_dt) @@ -88,29 +88,13 @@ module TestCat dt[1] = zeros(Int, nrow(dt)) vcat(dt, alt_dt) - # Don't fail on non-matching names - names!(alt_dt, [:A, :B, :C]) - vcat(dt, alt_dt) - dtr = vcat(dt4, dt4) @test size(dtr, 1) == 8 @test names(dt4) == names(dtr) @test isequal(dtr, [dt4; dt4]) - dtr = vcat(dt2, dt3) - @test size(dtr) == (8,2) - @test names(dt2) == names(dtr) - @test isnull(dtr[8,:x2]) - - # Eltype promotion - # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} - if VERSION >= v"0.5.0-dev" - @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Float64}] - @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}] - else - @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Any}] - @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Any}] - end + @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Float64}] + @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}] # Minimal container type promotion dta = DataTable(a = CategoricalArray([1, 2, 2])) @@ -122,12 +106,7 @@ module TestCat @test isequal(dtab[:a], Nullable{Int}[1, 2, 2, 2, 3, 4]) @test isequal(dtac[:a], Nullable{Int}[1, 2, 2, 2, 3, 4]) @test isa(dtab[:a], NullableCategoricalVector{Int}) - # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} - if VERSION >= v"0.5.0-dev" - @test isa(dtac[:a], NullableCategoricalVector{Int}) - else - @test isa(dtac[:a], NullableCategoricalVector{Any}) - end + @test isa(dtac[:a], NullableCategoricalVector{Int}) # ^^ container may flip if container promotion happens in Base/DataArrays dc = vcat(dtd, dtc) @test isequal(vcat(dtc, dtd), dc) @@ -137,15 +116,120 @@ module TestCat @test isequal(vcat(dtd, dtc0, dtc), dc) @test eltypes(vcat(dtd, dtc0)) == eltypes(dc) - # Missing columns - rename!(dtd, :a, :b) - dtda = DataTable(b = NullableArray(Nullable{Int}[2, 3, 4, Nullable(), Nullable(), Nullable()]), - a = NullableCategoricalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2])) - @test isequal(vcat(dtd, dta), dtda) - - # Alignment - @test isequal(vcat(dtda, dtd, dta), vcat(dtda, dtda)) - # vcat should be able to concatenate different implementations of AbstractDataTable (PR #944) @test isequal(vcat(view(DataTable(A=1:3),2),DataTable(A=4:5)), DataTable(A=[2,4,5])) + + @testset "vcat >2 args" begin + @test vcat(DataTable(), DataTable(), DataTable()) == DataTable() + dt = DataTable(x = trues(1), y = falses(1)) + @test vcat(dt, dt, dt) == DataTable(x = trues(3), y = falses(3)) + end + + @testset "vcat mixed coltypes" begin + drf = CategoricalArrays.DefaultRefType + dt = vcat(DataTable([[1]], [:x]), DataTable([[1.0]], [:x])) + @test dt == DataTable([[1.0, 1.0]], [:x]) + @test typeof.(dt.columns) == [Vector{Float64}] + dt = vcat(DataTable([[1]], [:x]), DataTable([["1"]], [:x])) + @test dt == DataTable([[1, "1"]], [:x]) + @test typeof.(dt.columns) == [Vector{Any}] + dt = vcat(DataTable([NullableArray([1])], [:x]), DataTable([[1]], [:x])) + @test dt == DataTable([NullableArray([1, 1])], [:x]) + @test typeof.(dt.columns) == [NullableVector{Int}] + dt = vcat(DataTable([CategoricalArray([1])], [:x]), DataTable([[1]], [:x])) + @test dt == DataTable([CategoricalArray([1, 1])], [:x]) + @test typeof.(dt.columns) == [CategoricalVector{Int, drf}] + dt = vcat(DataTable([CategoricalArray([1])], [:x]), + DataTable([NullableArray([1])], [:x])) + @test dt == DataTable([NullableCategoricalArray([1, 1])], [:x]) + @test typeof.(dt.columns) == [NullableCategoricalVector{Int, drf}] + dt = vcat(DataTable([CategoricalArray([1])], [:x]), + DataTable([NullableCategoricalArray([1])], [:x])) + @test dt == DataTable([NullableCategoricalArray([1, 1])], [:x]) + @test typeof.(dt.columns) == [NullableCategoricalVector{Int, drf}] + dt = vcat(DataTable([NullableArray([1])], [:x]), + DataTable([NullableArray(["1"])], [:x])) + @test dt == DataTable([NullableArray([1, "1"])], [:x]) + @test typeof.(dt.columns) == [NullableVector{Any}] + dt = vcat(DataTable([CategoricalArray([1])], [:x]), + DataTable([CategoricalArray(["1"])], [:x])) + @test dt == DataTable([CategoricalArray([1, "1"])], [:x]) + @test typeof.(dt.columns) == [CategoricalVector{Any, drf}] + dt = vcat(DataTable([trues(1)], [:x]), DataTable([[false]], [:x])) + @test dt == DataTable([[true, false]], [:x]) + @test typeof.(dt.columns) == [Vector{Bool}] + end + + @testset "vcat errors" begin + err = @test_throws ArgumentError vcat(DataTable(), DataTable(), DataTable(x=[])) + @test err.value.msg == "column(s) x are missing from argument(s) 1 and 2" + err = @test_throws ArgumentError vcat(DataTable(), DataTable(), DataTable(x=[1])) + @test err.value.msg == "column(s) x are missing from argument(s) 1 and 2" + dt1 = DataTable(A = 1:3, B = 1:3) + dt2 = DataTable(A = 1:3) + # right missing 1 column + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) B are missing from argument(s) 2" + # left missing 1 column + err = @test_throws ArgumentError vcat(dt2, dt1) + @test err.value.msg == "column(s) B are missing from argument(s) 1" + # multiple missing 1 column + err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2, dt2) + @test err.value.msg == "column(s) B are missing from argument(s) 2, 3, 4, 5 and 6" + # argument missing >1 columns + dt1 = DataTable(A = 1:3, B = 1:3, C = 1:3, D = 1:3, E = 1:3) + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2" + # >1 arguments missing >1 columns + err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2) + @test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2, 3, 4 and 5" + # out of order + dt2 = dt1[reverse(names(dt1))] + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2" + # first group >1 arguments + err = @test_throws ArgumentError vcat(dt1, dt1, dt2) + @test err.value.msg == "column order of argument(s) 1 and 2 != column order of argument(s) 3" + # second group >1 arguments + err = @test_throws ArgumentError vcat(dt1, dt2, dt2) + @test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2 and 3" + # first and second groups >1 argument + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2) + @test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6" + # >2 groups out of order + srand(1) + dt3 = dt1[shuffle(names(dt1))] + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt3) + @test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6 != column order of argument(s) 7, 8, 9 and 10" + # missing columns throws error before out of order columns + dt1 = DataTable(A = 1, B = 1) + dt2 = DataTable(A = 1) + dt3 = DataTable(B = 1, A = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3) + @test err.value.msg == "column(s) B are missing from argument(s) 2" + # unique columns for both sides + dt1 = DataTable(A = 1, B = 1, C = 1, D = 1) + dt2 = DataTable(A = 1, C = 1, D = 1, E = 1, F = 1) + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4" + dt3 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + # dt4 is a superset of names found in all other datatables and won't be shown in error + dt4 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1, F = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3, dt4, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt4, dt4, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11" + end end