Skip to content
This repository has been archived by the owner on May 5, 2019. It is now read-only.

update vcat to mimic Base.vcat and enhance promotion rules of mixed column type #45

Merged
merged 2 commits into from
May 12, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 83 additions & 69 deletions src/abstractdatatable/abstractdatatable.jl
Original file line number Diff line number Diff line change
Expand Up @@ -706,83 +706,97 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable) = hcat!(dt[:, :], dt2)
# Variadic hcat: combine the first two arguments with the two-argument `hcat`,
# then pass that result together with the remaining arguments to `hcat!`.
Base.hcat(dt::AbstractDataTable, x, y...) = hcat!(hcat(dt, x), y...)
# Same folding scheme for the case where every argument is an AbstractDataTable.
Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable...) = hcat!(hcat(dt1, dt2), dtn...)

# vcat only accepts DataTables. Finds union of columns, maintaining order
# of first dt. Missing data become null values.

Base.vcat(dt::AbstractDataTable) = dt

Base.vcat(dts::AbstractDataTable...) = vcat(AbstractDataTable[dts...])

function Base.vcat{T<:AbstractDataTable}(dts::Vector{T})
isempty(dts) && return DataTable()
coltyps, colnams, similars = _colinfo(dts)

res = DataTable()
Nrow = sum(nrow, dts)
for j in 1:length(colnams)
colnam = colnams[j]
col = similar(similars[j], coltyps[j], Nrow)

i = 1
for dt in dts
if haskey(dt, colnam)
copy!(col, i, dt[colnam])
end
i += size(dt, 1)
@generated function promote_col_type(cols::AbstractVector...)
elty = Base.promote_eltype(cols...)
if elty <: Nullable
elty = eltype(elty)
end
if elty <: CategoricalValue
elty = elty.parameters[1]
end
if any(col -> eltype(col) <: Nullable, cols)
if any(col -> col <: Union{AbstractCategoricalArray, AbstractNullableCategoricalArray}, cols)
return :(NullableCategoricalVector{$elty})
else
return :(NullableVector{$elty})
end
else
if any(col -> col <: Union{AbstractCategoricalArray, AbstractNullableCategoricalArray}, cols)
return :(CategoricalVector{$elty})
else
return :(Vector{$elty})
end

res[colnam] = col
end
res
end

_isnullable{T}(::AbstractArray{T}) = T <: Nullable
const EMPTY_DATA = NullableArray(Void, 0)

function _colinfo{T<:AbstractDataTable}(dts::Vector{T})
dt1 = dts[1]
colindex = copy(index(dt1))
coltyps = eltypes(dt1)
similars = collect(columns(dt1))
nonnull_ct = Int[_isnullable(c) for c in columns(dt1)]

for i in 2:length(dts)
dt = dts[i]
for j in 1:size(dt, 2)
col = dt[j]
cn, ct = _names(dt)[j], eltype(col)
if haskey(colindex, cn)
idx = colindex[cn]

oldtyp = coltyps[idx]
if !(ct <: oldtyp)
coltyps[idx] = promote_type(oldtyp, ct)
# Needed on Julia 0.4 since e.g.
# promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T},
# which is not a usable type: fall back to Nullable{Any}
if VERSION < v"0.5.0-dev" &&
coltyps[idx] <: Nullable && !isa(coltyps[idx].types[2], DataType)
coltyps[idx] = Nullable{Any}
end
end
nonnull_ct[idx] += !_isnullable(col)
else # new column
push!(colindex, cn)
push!(coltyps, ct)
push!(similars, col)
push!(nonnull_ct, !_isnullable(col))
"""
vcat(dts::AbstractDataTable...)

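Vertically concatenate `AbstractDataTable`s that have the same column names in
the same order. An `ArgumentError` is thrown if any argument is missing a column
found in another argument, or if the column order differs between arguments.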

# Example
```jldoctest
julia> dt1 = DataTable(A=1:3, B=1:3);
julia> dt2 = DataTable(A=4:6, B=4:6);
julia> vcat(dt1, dt2)
6×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │
│ 4 │ 4 │ 4 │
│ 5 │ 5 │ 5 │
│ 6 │ 6 │ 6 │
```
"""
Base.vcat(dt::AbstractDataTable) = dt
function Base.vcat(dts::AbstractDataTable...)
isempty(dts) && return DataTable()
allheaders = map(names, dts)
if all(h -> length(h) == 0, allheaders)
return DataTable()
end
uniqueheaders = unique(allheaders)
if length(uniqueheaders) > 1
unionunique = union(uniqueheaders...)
coldiff = setdiff(unionunique, intersect(uniqueheaders...))
if !isempty(coldiff)
# if any datatables are a full superset of names, skip them
filter!(u -> Set(u) != Set(unionunique), uniqueheaders)
estrings = Vector{String}(length(uniqueheaders))
for (i, u) in enumerate(uniqueheaders)
matching = find(h -> u == h, allheaders)
headerdiff = setdiff(coldiff, u)
cols = join(headerdiff, ", ", " and ")
args = join(matching, ", ", " and ")
estrings[i] = "column(s) $cols are missing from argument(s) $args"
end
throw(ArgumentError(join(estrings, ", ", ", and ")))
else
estrings = Vector{String}(length(uniqueheaders))
for (i, u) in enumerate(uniqueheaders)
indices = find(a -> a == u, allheaders)
estrings[i] = "column order of argument(s) $(join(indices, ", ", " and "))"
end
throw(ArgumentError(join(estrings, " != ")))
end
end

for j in 1:length(colindex)
if nonnull_ct[j] < length(dts) && !_isnullable(similars[j])
similars[j] = EMPTY_DATA
else
header = uniqueheaders[1]
cols = Vector{Any}(length(header))
for i in 1:length(cols)
data = [dt[i] for dt in dts]
lens = map(length, data)
cols[i] = promote_col_type(data...)(sum(lens))
offset = 1
for j in 1:length(data)
copy!(cols[i], offset, data[j])
offset += lens[j]
end
end
return DataTable(cols, header)
end
colnams = _names(colindex)

coltyps, colnams, similars
end

##############################################################################
Expand Down
2 changes: 1 addition & 1 deletion src/groupeddatatable/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ combine(map(d -> mean(dropnull(d[:c])), gd))
"""
function combine(ga::GroupApplied)
gd, vals = ga.gd, ga.vals
valscat = vcat(vals)
valscat = vcat(vals...)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this go into a separate commit?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Different commit but same PR? I can do that 👍

idx = Vector{Int}(size(valscat, 1))
j = 0
@inbounds for (start, val) in zip(gd.starts, vals)
Expand Down
166 changes: 125 additions & 41 deletions test/cat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,14 @@ module TestCat
dt[1:2, 1:2] = [3,2]
dt[[true,false,false,true], 2:3] = [2,3]

vcat([])
vcat(null_dt)
vcat(null_dt, null_dt)
vcat(null_dt, dt)
vcat(dt, null_dt)
vcat(dt, dt)
vcat(dt, dt, dt)
@test vcat(DataTable[]) == DataTable()
@test vcat(null_dt) == DataTable()
@test vcat(null_dt, null_dt) == DataTable()
@test_throws ArgumentError vcat(null_dt, dt)
@test_throws ArgumentError vcat(dt, null_dt)
@test eltypes(vcat(dt, dt)) == [Nullable{Float64}, Nullable{Float64}, Nullable{Int}]
@test size(vcat(dt, dt)) == (size(dt,1)*2, size(dt,2))
@test eltypes(vcat(dt, dt, dt)) == [Nullable{Float64}, Nullable{Float64}, Nullable{Int}]
@test size(vcat(dt, dt, dt)) == (size(dt,1)*3, size(dt,2))

alt_dt = deepcopy(dt)
vcat(dt, alt_dt)
Expand All @@ -88,29 +88,13 @@ module TestCat
dt[1] = zeros(Int, nrow(dt))
vcat(dt, alt_dt)

# Don't fail on non-matching names
names!(alt_dt, [:A, :B, :C])
vcat(dt, alt_dt)

dtr = vcat(dt4, dt4)
@test size(dtr, 1) == 8
@test names(dt4) == names(dtr)
@test isequal(dtr, [dt4; dt4])

dtr = vcat(dt2, dt3)
@test size(dtr) == (8,2)
@test names(dt2) == names(dtr)
@test isnull(dtr[8,:x2])

# Eltype promotion
# Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}
if VERSION >= v"0.5.0-dev"
@test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Float64}]
@test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}]
else
@test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Any}]
@test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Any}]
end
@test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Float64}]
@test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}]

# Minimal container type promotion
dta = DataTable(a = CategoricalArray([1, 2, 2]))
Expand All @@ -122,12 +106,7 @@ module TestCat
@test isequal(dtab[:a], Nullable{Int}[1, 2, 2, 2, 3, 4])
@test isequal(dtac[:a], Nullable{Int}[1, 2, 2, 2, 3, 4])
@test isa(dtab[:a], NullableCategoricalVector{Int})
# Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}
if VERSION >= v"0.5.0-dev"
@test isa(dtac[:a], NullableCategoricalVector{Int})
else
@test isa(dtac[:a], NullableCategoricalVector{Any})
end
@test isa(dtac[:a], NullableCategoricalVector{Int})
# ^^ container may flip if container promotion happens in Base/DataArrays
dc = vcat(dtd, dtc)
@test isequal(vcat(dtc, dtd), dc)
Expand All @@ -137,15 +116,120 @@ module TestCat
@test isequal(vcat(dtd, dtc0, dtc), dc)
@test eltypes(vcat(dtd, dtc0)) == eltypes(dc)

# Missing columns
rename!(dtd, :a, :b)
dtda = DataTable(b = NullableArray(Nullable{Int}[2, 3, 4, Nullable(), Nullable(), Nullable()]),
a = NullableCategoricalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2]))
@test isequal(vcat(dtd, dta), dtda)

# Alignment
@test isequal(vcat(dtda, dtd, dta), vcat(dtda, dtda))

# vcat should be able to concatenate different implementations of AbstractDataTable (PR #944)
@test isequal(vcat(view(DataTable(A=1:3),2),DataTable(A=4:5)), DataTable(A=[2,4,5]))

# vcat called with three arguments: all-empty tables and repeated one-row tables.
@testset "vcat >2 args" begin
    # Three column-less tables concatenate to a column-less table.
    empties = (DataTable(), DataTable(), DataTable())
    @test vcat(empties...) == DataTable()

    # Three copies of a one-row table stack into three rows with the same columns.
    onerow = DataTable(x = trues(1), y = falses(1))
    stacked = vcat(onerow, onerow, onerow)
    @test stacked == DataTable(x = trues(3), y = falses(3))
end

# Pins the container type and element type vcat produces when the same column
# is stored differently in each argument. Every case checks both the values
# and the concrete type of the resulting column.
@testset "vcat mixed coltypes" begin
# reference type expected as the second parameter of the categorical results below
drf = CategoricalArrays.DefaultRefType
# Vector{Int} + Vector{Float64} -> Vector{Float64}
dt = vcat(DataTable([[1]], [:x]), DataTable([[1.0]], [:x]))
@test dt == DataTable([[1.0, 1.0]], [:x])
@test typeof.(dt.columns) == [Vector{Float64}]
# Vector{Int} + Vector{String} -> Vector{Any}
dt = vcat(DataTable([[1]], [:x]), DataTable([["1"]], [:x]))
@test dt == DataTable([[1, "1"]], [:x])
@test typeof.(dt.columns) == [Vector{Any}]
# NullableArray + Vector -> NullableVector
dt = vcat(DataTable([NullableArray([1])], [:x]), DataTable([[1]], [:x]))
@test dt == DataTable([NullableArray([1, 1])], [:x])
@test typeof.(dt.columns) == [NullableVector{Int}]
# CategoricalArray + Vector -> CategoricalVector
dt = vcat(DataTable([CategoricalArray([1])], [:x]), DataTable([[1]], [:x]))
@test dt == DataTable([CategoricalArray([1, 1])], [:x])
@test typeof.(dt.columns) == [CategoricalVector{Int, drf}]
# CategoricalArray + NullableArray -> NullableCategoricalVector
dt = vcat(DataTable([CategoricalArray([1])], [:x]),
DataTable([NullableArray([1])], [:x]))
@test dt == DataTable([NullableCategoricalArray([1, 1])], [:x])
@test typeof.(dt.columns) == [NullableCategoricalVector{Int, drf}]
# CategoricalArray + NullableCategoricalArray -> NullableCategoricalVector
dt = vcat(DataTable([CategoricalArray([1])], [:x]),
DataTable([NullableCategoricalArray([1])], [:x]))
@test dt == DataTable([NullableCategoricalArray([1, 1])], [:x])
@test typeof.(dt.columns) == [NullableCategoricalVector{Int, drf}]
# NullableArray of Int + NullableArray of String -> NullableVector{Any}
dt = vcat(DataTable([NullableArray([1])], [:x]),
DataTable([NullableArray(["1"])], [:x]))
@test dt == DataTable([NullableArray([1, "1"])], [:x])
@test typeof.(dt.columns) == [NullableVector{Any}]
# CategoricalArray of Int + CategoricalArray of String -> CategoricalVector{Any}
dt = vcat(DataTable([CategoricalArray([1])], [:x]),
DataTable([CategoricalArray(["1"])], [:x]))
@test dt == DataTable([CategoricalArray([1, "1"])], [:x])
@test typeof.(dt.columns) == [CategoricalVector{Any, drf}]
# `trues(1)` (a BitVector) + Vector{Bool} -> Vector{Bool}
dt = vcat(DataTable([trues(1)], [:x]), DataTable([[false]], [:x]))
@test dt == DataTable([[true, false]], [:x])
@test typeof.(dt.columns) == [Vector{Bool}]
end

# Pins the exact ArgumentError messages vcat raises when its arguments do not
# share the same header. The numbers in each message are the 1-based positions
# of the offending arguments in the call. Note that dt1/dt2/dt3 are rebound
# several times below, so the cases must stay in this order.
@testset "vcat errors" begin
# column-less tables mixed with a table that has a column (empty, then non-empty)
err = @test_throws ArgumentError vcat(DataTable(), DataTable(), DataTable(x=[]))
@test err.value.msg == "column(s) x are missing from argument(s) 1 and 2"
err = @test_throws ArgumentError vcat(DataTable(), DataTable(), DataTable(x=[1]))
@test err.value.msg == "column(s) x are missing from argument(s) 1 and 2"
dt1 = DataTable(A = 1:3, B = 1:3)
dt2 = DataTable(A = 1:3)
# right missing 1 column
err = @test_throws ArgumentError vcat(dt1, dt2)
@test err.value.msg == "column(s) B are missing from argument(s) 2"
# left missing 1 column
err = @test_throws ArgumentError vcat(dt2, dt1)
@test err.value.msg == "column(s) B are missing from argument(s) 1"
# multiple missing 1 column
err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2, dt2)
@test err.value.msg == "column(s) B are missing from argument(s) 2, 3, 4, 5 and 6"
# argument missing >1 columns
dt1 = DataTable(A = 1:3, B = 1:3, C = 1:3, D = 1:3, E = 1:3)
err = @test_throws ArgumentError vcat(dt1, dt2)
@test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2"
# >1 arguments missing >1 columns
err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2)
@test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2, 3, 4 and 5"
# out of order
dt2 = dt1[reverse(names(dt1))]
err = @test_throws ArgumentError vcat(dt1, dt2)
@test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2"
# first group >1 arguments
err = @test_throws ArgumentError vcat(dt1, dt1, dt2)
@test err.value.msg == "column order of argument(s) 1 and 2 != column order of argument(s) 3"
# second group >1 arguments
err = @test_throws ArgumentError vcat(dt1, dt2, dt2)
@test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2 and 3"
# first and second groups >1 argument
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2)
@test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6"
# >2 groups out of order
# NOTE(review): the seed is fixed so the shuffle is reproducible; the expected
# message presumably relies on the shuffled order differing from both dt1 and dt2.
srand(1)
dt3 = dt1[shuffle(names(dt1))]
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt3)
@test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6 != column order of argument(s) 7, 8, 9 and 10"
# missing columns throws error before out of order columns
dt1 = DataTable(A = 1, B = 1)
dt2 = DataTable(A = 1)
dt3 = DataTable(B = 1, A = 1)
err = @test_throws ArgumentError vcat(dt1, dt2, dt3)
@test err.value.msg == "column(s) B are missing from argument(s) 2"
# unique columns for both sides
dt1 = DataTable(A = 1, B = 1, C = 1, D = 1)
dt2 = DataTable(A = 1, C = 1, D = 1, E = 1, F = 1)
err = @test_throws ArgumentError vcat(dt1, dt2)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2"
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4"
# a third header that lacks only F
dt3 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1)
err = @test_throws ArgumentError vcat(dt1, dt2, dt3)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3"
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6"
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9"
# dt4 is a superset of names found in all other datatables and won't be shown in error
dt4 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1, F = 1)
err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3"
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3, dt4, dt4)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6"
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt4, dt4, dt4)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9"
# same headers interleaved rather than grouped: argument positions are reported per header
err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11"
end
end
2 changes: 1 addition & 1 deletion test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ module TestGrouping
groupby(dt, [:v1, :v2])

dt2 = by(e->1, DataTable(x=Int64[]), :x)
@test size(dt2) == (0,1)
@test size(dt2) == (0,2)
@test isequal(sum(dt2[:x]), Nullable(0))

# Check that reordering levels does not confuse groupby
Expand Down