diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index 0bc4656..d52e602 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -9,13 +9,10 @@ similar_nullable{T}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = NullableArray{eltype(T)}(dims) -similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = +similar_nullable{T}(dv::CategoricalArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = NullableCategoricalArray{T}(dims) -similar_nullable(dt::AbstractDataTable, dims::Int) = - DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt))) - -similar_nullable{T,R}(dv::NullableCategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = +similar_nullable{T}(dv::NullableCategoricalArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = NullableCategoricalArray{T}(dims) # helper structure for DataTables joining @@ -47,31 +44,28 @@ Base.length(x::RowIndexMap) = length(x.orig) # composes the joined data table using the maps between the left and right # table rows and the indices of rows in the result -function compose_joined_table(joiner::DataTableJoiner, +function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap, right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap) @assert length(left_ixs) == length(right_ixs) # compose left half of the result taking all left columns all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig) - if length(leftonly_ixs) > 0 + + ril = length(right_ixs) + lil = length(left_ixs) + loil = length(leftonly_ixs) + roil = length(rightonly_ixs) + + if loil > 0 # combine the matched (left_ixs.orig) and non-matched (leftonly_ixs.orig) indices of the left table rows # preserving the original rows order - all_orig_left_ixs = similar(left_ixs.orig, length(left_ixs)+length(leftonly_ixs)) + all_orig_left_ixs = similar(left_ixs.orig, lil + loil) @inbounds all_orig_left_ixs[left_ixs.join] = left_ixs.orig @inbounds all_orig_left_ixs[leftonly_ixs.join] = leftonly_ixs.orig else # the result contains only the left rows that are matched to right rows (left_ixs) all_orig_left_ixs = left_ixs.orig # no need to copy left_ixs.orig as it's not used elsewhere end - ril = length(right_ixs) - loil = length(leftonly_ixs) - roil = length(rightonly_ixs) - left_dt = DataTable(Any[resize!(col[all_orig_left_ixs], length(all_orig_left_ixs)+roil) - for col in columns(joiner.dtl)], - names(joiner.dtl)) - - # compose right half of the result taking all right columns excluding on - dtr_noon = without(joiner.dtr, joiner.on_cols) # permutation to swap rightonly and leftonly rows right_perm = vcat(1:ril, ril+roil+1:ril+roil+loil, ril+1:ril+roil) if length(leftonly_ixs) > 0 @@ -79,23 +73,45 @@ function compose_joined_table(joiner::DataTableJoiner, right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:ril+loil] end all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig) - right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+loil)[right_perm] - for col in columns(dtr_noon)], - names(dtr_noon)) - # merge left and right parts of the joined table - res = hcat!(left_dt, right_dt) + + # compose right half of the result taking all right columns excluding on + dtr_noon = without(joiner.dtr, joiner.on_cols) + + nrow = length(all_orig_left_ixs) + roil + @assert nrow == length(all_orig_right_ixs) + loil + ncleft = ncol(joiner.dtl) + cols = Vector{Any}(ncleft + ncol(dtr_noon)) + _similar = kind == :inner ? similar : similar_nullable + for (i, col) in enumerate(columns(joiner.dtl)) + cols[i] = _similar(col, nrow) + fillcolumn!(cols[i], col, all_orig_left_ixs) + end + for (i, col) in enumerate(columns(dtr_noon)) + cols[i+ncleft] = _similar(col, nrow) + fillcolumn!(cols[i+ncleft], col, all_orig_right_ixs) + permute!(cols[i+ncleft], right_perm) + end + res = DataTable(cols, vcat(names(joiner.dtl), names(dtr_noon))) if length(rightonly_ixs.join) > 0 # some left rows are nulls, so the values of the "on" columns # need to be taken from the right for (on_col_ix, on_col) in enumerate(joiner.on_cols) # fix the result of the rightjoin by taking the nonnull values from the right table - res[on_col][rightonly_ixs.join] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix] + offset = nrow - length(rightonly_ixs.orig) + fillcolumn!(res[on_col], joiner.dtr_on[on_col_ix], rightonly_ixs.orig, offset) end end return res end +function fillcolumn!{T1, T2}(dtcol::AbstractVector{T1}, refcol::AbstractVector{T2}, + indices::Vector{Int}, offset::Int=0) + @inbounds for (j, k) in enumerate(indices) + dtcol[j+offset] = refcol[k] + end +end + # map the indices of the left and right joined tables # to the indices of the rows in the resulting table # if `nothing` is given, the corresponding map is not built @@ -210,7 +226,8 @@ join(dt1::AbstractDataTable, - `:cross` : a full Cartesian product of the key combinations; every row of `dt1` is matched with every row of `dt2` -Null values are filled in where needed to complete joins. +For the three join operations that may introduce missing values (`:outer`, `:left`, +and `:right`), all columns of the returned data table will be nullable. ### Result @@ -246,22 +263,21 @@ function Base.join(dt1::AbstractDataTable, joiner = DataTableJoiner(dt1, dt2, on) if kind == :inner - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, false, true, false)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, false, true, false)...) elseif kind == :left - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, false)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, false)...) elseif kind == :right - right_ixs, rightonly_ixs, left_ixs, leftonly_ixs = update_row_maps!(joiner.dtr_on, joiner.dtl_on, - group_rows(joiner.dtl_on), - true, true, true, false) - compose_joined_table(joiner, left_ixs, leftonly_ixs, right_ixs, rightonly_ixs) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtr_on, joiner.dtl_on, + group_rows(joiner.dtl_on), + true, true, true, false)[[3, 4, 1, 2]]...) elseif kind == :outer - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, true)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, true)...) elseif kind == :semi # hash the right rows dtr_on_grp = group_rows(joiner.dtr_on) diff --git a/test/join.jl b/test/join.jl index 0ac3fe6..a656089 100644 --- a/test/join.jl +++ b/test/join.jl @@ -116,5 +116,205 @@ module TestJoin @test join(dt, dtnull, on = :x) == DataTable([collect(1:10), collect(2:11), NullableArray(3:12)], [:x, :y, :z]) @test join(dtnull, dt, on = :x) == - DataTable([NullableArray(1:10), NullableArray(3:12), NullableArray(2:11)], [:x, :z, :y]) + DataTable([NullableArray(1:10), NullableArray(3:12), collect(2:11)], [:x, :z, :y]) + + @testset "all joins" begin + dt1 = DataTable(Any[[1, 3, 5], [1.0, 3.0, 5.0]], [:id, :fid]) + dt2 = DataTable(Any[[0, 1, 2, 3, 4], [0.0, 1.0, 2.0, 3.0, 4.0]], [:id, :fid]) + N = Nullable() + + @test join(dt1, dt2, kind=:cross) == + DataTable(Any[repeat([1, 3, 5], inner = 5), + repeat([1, 3, 5], inner = 5), + repeat([0, 1, 2, 3, 4], outer = 3), + repeat([0, 1, 2, 3, 4], outer = 3)], + [:id, :fid, :id_1, :fid_1]) + @test typeof.(join(dt1, dt2, kind=:cross).columns) == + [Vector{Int}, Vector{Float64}, Vector{Int}, Vector{Float64}] + + i(on) = join(dt1, dt2, on = on, kind = :inner) + l(on) = join(dt1, dt2, on = on, kind = :left) + r(on) = join(dt1, dt2, on = on, kind = :right) + o(on) = join(dt1, dt2, on = on, kind = :outer) + s(on) = join(dt1, dt2, on = on, kind = :semi) + a(on) = join(dt1, dt2, on = on, kind = :anti) + + @test s(:id) == + s(:fid) == + s([:id, :fid]) == DataTable(Any[[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(s(:id).columns) == + typeof.(s(:fid).columns) == + typeof.(s([:id, :fid]).columns) == [Vector{Int}, Vector{Float64}] + @test a(:id) == + a(:fid) == + a([:id, :fid]) == DataTable(Any[[5], [5]], [:id, :fid]) + @test typeof.(a(:id).columns) == + typeof.(a(:fid).columns) == + typeof.(a([:id, :fid]).columns) == [Vector{Int}, Vector{Float64}] + + on = :id + @test i(on) == DataTable(Any[[1, 3], [1, 3], [1, 3]], [:id, :fid, :fid_1]) + @test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Float64}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5]), + fid_1 = NullableArray([1, 3, N])) + @test typeof.(l(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Float64}] + @test r(on) == DataTable(id = NullableArray([1, 3, 0, 2, 4]), + fid = NullableArray([1, 3, N, N, N]), + fid_1 = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Float64}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, 0, 2, 4]), + fid = NullableArray([1, 3, 5, N, N, N]), + fid_1 = NullableArray([1, 3, N, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Float64}] + + on = :fid + @test i(on) == DataTable(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1]) + @test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Int}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5]), + id_1 = NullableArray([1, 3, N])) + @test typeof.(l(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Int}] + @test r(on) == DataTable(id = NullableArray([1, 3, N, N, N]), + fid = NullableArray([1, 3, 0, 2, 4]), + id_1 = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Int}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, N, N, N]), + fid = NullableArray([1, 3, 5, 0, 2, 4]), + id_1 = NullableArray([1, 3, N, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Int}] + + on = [:id, :fid] + @test i(on) == DataTable(Any[[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5])) + @test typeof.(l(on).columns) == [NullableVector{Int}, + NullableVector{Float64}] + @test r(on) == DataTable(id = NullableArray([1, 3, 0, 2, 4]), + fid = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableVector{Int}, + NullableVector{Float64}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, 0, 2, 4]), + fid = NullableArray([1, 3, 5, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableVector{Int}, + NullableVector{Float64}] + end + + @testset "all joins with CategoricalArrays" begin + dt1 = DataTable(Any[CategoricalArray([1, 3, 5]), + CategoricalArray([1.0, 3.0, 5.0])], [:id, :fid]) + dt2 = DataTable(Any[CategoricalArray([0, 1, 2, 3, 4]), + CategoricalArray([0.0, 1.0, 2.0, 3.0, 4.0])], [:id, :fid]) + N = Nullable() + DRT = CategoricalArrays.DefaultRefType + + @test join(dt1, dt2, kind=:cross) == + DataTable(Any[repeat([1, 3, 5], inner = 5), + repeat([1, 3, 5], inner = 5), + repeat([0, 1, 2, 3, 4], outer = 3), + repeat([0, 1, 2, 3, 4], outer = 3)], + [:id, :fid, :id_1, :fid_1]) + @test typeof.(join(dt1, dt2, kind=:cross).columns) == + [CategoricalVector{i, DRT} for i in [Int, Float64, Int, Float64]] + + i(on) = join(dt1, dt2, on = on, kind = :inner) + l(on) = join(dt1, dt2, on = on, kind = :left) + r(on) = join(dt1, dt2, on = on, kind = :right) + o(on) = join(dt1, dt2, on = on, kind = :outer) + s(on) = join(dt1, dt2, on = on, kind = :semi) + a(on) = join(dt1, dt2, on = on, kind = :anti) + + @test s(:id) == + s(:fid) == + s([:id, :fid]) == DataTable(Any[[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(s(:id).columns) == + typeof.(s(:fid).columns) == + typeof.(s([:id, :fid]).columns) == [CategoricalVector{Int, DRT}, + CategoricalVector{Float64, DRT}] + @test a(:id) == + a(:fid) == + a([:id, :fid]) == DataTable(Any[[5], [5]], [:id, :fid]) + @test typeof.(a(:id).columns) == + typeof.(a(:fid).columns) == + typeof.(a([:id, :fid]).columns) == [CategoricalVector{Int, DRT}, + CategoricalVector{Float64, DRT}] + + on = :id + @test i(on) == DataTable(Any[[1, 3], [1, 3], [1, 3]], [:id, :fid, :fid_1]) + @test typeof.(i(on).columns) == [CategoricalVector{Int, DRT}, + CategoricalVector{Float64, DRT}, + CategoricalVector{Float64, DRT}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5]), + fid_1 = NullableArray([1, 3, N])) + @test typeof.(l(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Float64, DRT}] + @test r(on) == DataTable(id = NullableArray([1, 3, 0, 2, 4]), + fid = NullableArray([1, 3, N, N, N]), + fid_1 = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Float64, DRT}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, 0, 2, 4]), + fid = NullableArray([1, 3, 5, N, N, N]), + fid_1 = NullableArray([1, 3, N, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Float64, DRT}] + + on = :fid + @test i(on) == DataTable(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1]) + @test typeof.(i(on).columns) == [CategoricalVector{Int, DRT}, + CategoricalVector{Float64, DRT}, + CategoricalVector{Int, DRT}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5]), + id_1 = NullableArray([1, 3, N])) + @test typeof.(l(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Int, DRT}] + @test r(on) == DataTable(id = NullableArray([1, 3, N, N, N]), + fid = NullableArray([1, 3, 0, 2, 4]), + id_1 = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Int, DRT}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, N, N, N]), + fid = NullableArray([1, 3, 5, 0, 2, 4]), + id_1 = NullableArray([1, 3, N, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Int, DRT}] + + on = [:id, :fid] + @test i(on) == DataTable(Any[[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(i(on).columns) == [CategoricalVector{Int, DRT}, + CategoricalVector{Float64, DRT}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5])) + @test typeof.(l(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}] + @test r(on) == DataTable(id = NullableArray([1, 3, 0, 2, 4]), + fid = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, 0, 2, 4]), + fid = NullableArray([1, 3, 5, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}] + end end