From 1509f63657395971e19a2725a0e7022ff8a6f96d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 27 Jan 2021 00:06:42 +0100 Subject: [PATCH 01/59] implement faster innerjoin --- src/abstractdataframe/join.jl | 89 +++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 5 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 0e391d4c99..1bcadebadf 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -91,6 +91,89 @@ _rename_cols(old_names::AbstractVector{Symbol}, (renamecols isa Function ? Symbol(renamecols(string(n))) : Symbol(n, renamecols)) for n in old_names] +prepare_on_col() = throw(ArgumentError("at least one on column required when joining")) +prepare_on_col(c::AbstractVector) = c +prepare_on_col(cs::AbstractVector...) = tuple.(cs...) + +function compose_inner_table(joiner::DataFrameJoiner, + makeunique::Bool, + left_rename::Union{Function, AbstractString, Symbol}, + right_rename::Union{Function, AbstractString, Symbol}) + + left_col = prepare_on_col(eachcol(joiner.dfl_on)...) + right_col = prepare_on_col(eachcol(joiner.dfr_on)...) + + if length(right_col) <= length(left_col) + left_ixs, right_ixs = _innerjoin(left_col, right_col) + else + right_ixs, left_ixs = _innerjoin(right_col, left_col) + end + + dfl = joiner.dfl[left_ixs, :] + dfr_noon = joiner.dfr[right_ixs, Not(joiner.right_on)] + + ncleft = ncol(dfl) + cols = Vector{AbstractVector}(undef, ncleft + ncol(dfr_noon)) + + for (i, col) in enumerate(eachcol(dfl)) + cols[i] = col + end + for (i, col) in enumerate(eachcol(dfr_noon)) + cols[i+ncleft] = col + end + + new_names = vcat(_rename_cols(_names(joiner.dfl), left_rename, joiner.left_on), + _rename_cols(_names(dfr_noon), right_rename)) + res = DataFrame(cols, new_names, makeunique=makeunique, copycols=false) + + return res, nothing, nothing +end + +# optimistically assume that shorter table does not have duplicates in on column +function _innerjoin(left::AbstractArray, right::AbstractArray{T}) where {T} + left_ixs = Int[] + right_ixs = Int[] + dict = Dict{T, Int}() + + for (idx_r, val_r) in enumerate(right) + dict_index = Base.ht_keyindex2!(dict, val_r) + dict_index > 0 && return _innerjoin_dup(left, right) + Base._setindex!(dict, idx_r, val_r, -dict_index) + end + + for (idx_l, val_l) in enumerate(left) + dict_index = Base.ht_keyindex(dict, val_l) + if dict_index > 0 # -1 if key not found + @inbounds idx_r = dict.vals[dict_index] + push!(left_ixs, idx_l) + push!(right_ixs, idx_r) + end + end + return left_ixs, right_ixs +end + +# we fall back to general case if we have duplicates +function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}) where {T} + left_ixs = Int[] + right_ixs = Int[] + dict = Dict{T, Vector{Int}}() + + for (idx_r, val_r) in enumerate(right) + push!(get!(Vector{Int}, dict, val_r), idx_r) + end + + @inbounds for (idx_l, val_l) in enumerate(left) + dict_index = Base.ht_keyindex(dict, val_l) + if dict_index > 0 # -1 if key not found + @inbounds idxs_r = dict.vals[dict_index] + append!(left_ixs, Iterators.repeated(idx_l, length(idxs_r))) + append!(right_ixs, idxs_r) + end + end + + return left_ixs, right_ixs +end + function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap, right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap, @@ -383,12 +466,8 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; left_indicator, right_indicator = nothing, nothing if kind == :inner - inner_row_maps = update_row_maps!(joiner.dfl_on, joiner.dfr_on, - group_rows(joiner.dfr_on), - true, false, true, false) joined, left_indicator, right_indicator = - compose_joined_table(joiner, kind, inner_row_maps..., - makeunique, left_rename, right_rename, nothing) + compose_inner_table(joiner, makeunique, left_rename, right_rename) elseif kind == :left left_row_maps = update_row_maps!(joiner.dfl_on, joiner.dfr_on, group_rows(joiner.dfr_on), From 2b222b7613db69d7ef5961565acc8c6aef46d236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 27 Jan 2021 11:33:00 +0100 Subject: [PATCH 02/59] add handling of sorted tables --- src/abstractdataframe/join.jl | 114 ++++++++++++++++++++++++++++++++-- 1 file changed, 108 insertions(+), 6 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 1bcadebadf..42f055b37d 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -103,10 +103,32 @@ function compose_inner_table(joiner::DataFrameJoiner, left_col = prepare_on_col(eachcol(joiner.dfl_on)...) right_col = prepare_on_col(eachcol(joiner.dfr_on)...) - if length(right_col) <= length(left_col) - left_ixs, right_ixs = _innerjoin(left_col, right_col) + if isempty(left_col) || isempty(right_col) + # we treat this case separately so we know we have at least one element later + left_ixs, right_ixs = Int[], Int[] else - right_ixs, left_ixs = _innerjoin(right_col, left_col) + both_sorted = false + try + # the isless, isequal and isconcretetype tests are to make sure + # that if we use the fast path for sorted vectors we do not hit + # the problem that some entries are not comparable + isequal(left_col[1], right_col[1]) + isless(left_col[1], right_col[1]) + if isconcretetype(left_col) && isconcretetype(right_col) && + issorted(left_col) && issorted(right_col) + both_sorted = true + end + catch + # nothing to do - one of the columns is not sortable + end + + if both_sorted + left_ixs, right_ixs = _innerjoin_sorted(left_col, right_col) + elseif length(right_col) <= length(left_col) + left_ixs, right_ixs = _innerjoin_unsorted(left_col, right_col) + else + right_ixs, left_ixs = _innerjoin_unsorted(right_col, left_col) + end end dfl = joiner.dfl[left_ixs, :] @@ -129,8 +151,72 @@ function compose_inner_table(joiner::DataFrameJoiner, return res, nothing, nothing end +@inline function find_next_range(x::AbstractArray, start::Int, start_value) + local stop_value + n = length(x) + stop = start + 1 + while stop <= n + stop_value = x[stop] + if isequal(start_value, stop_value) + stop += 1 + else + return stop, stop_value + end + end + return stop, start_value +end + +function _innerjoin_sorted(left::AbstractArray, right::AbstractArray) + left_n = length(left) + right_n = length(right) + + left_ixs = Int[] + right_ixs = Int[] + + (left_n == 0 || right_n == 0) && return left_ixs, right_ixs + + # lower bound assuming we get matches + sizehint!(left_ixs, min(left_n, right_n)) + sizehint!(right_ixs, min(left_n, right_n)) + + left_cur = 1 + left_val = left[left_cur] + left_new, left_tmp = find_next_range(left, left_cur, left_val) + + right_cur = 1 + right_val = right[right_cur] + right_new, right_tmp = find_next_range(right, right_cur, right_val) + + while left_cur <= left_n && right_cur <= right_n + if isequal(left_val, right_val) + if left_new - left_cur == right_new - right_cur == 2 + push!(left_ixs, left_cur) + push!(right_ixs, right_cur) + else + for (left_i, right_i) in Iterators.product(left_cur:left_new - 1, + right_cur:right_new - 1) + push!(left_ixs, left_i) + push!(right_ixs, right_i) + end + end + left_cur, left_val = left_new, left_tmp + left_new, left_tmp = find_next_range(left, left_cur, left_val) + right_cur, right_val = right_new, right_tmp + right_new, right_tmp = find_next_range(right, right_cur, right_val) + elseif isless(left_val, right_val) + left_cur, left_val = left_new, left_tmp + left_new, left_tmp = find_next_range(left, left_cur, left_val) + else + right_cur, right_val = right_new, right_tmp + right_new, right_tmp = find_next_range(right, right_cur, right_val) + end + end + + return left_ixs, right_ixs +end + # optimistically assume that shorter table does not have duplicates in on column -function _innerjoin(left::AbstractArray, right::AbstractArray{T}) where {T} +function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where {T} left_ixs = Int[] right_ixs = Int[] dict = Dict{T, Int}() @@ -153,21 +239,37 @@ function _innerjoin(left::AbstractArray, right::AbstractArray{T}) where {T} end # we fall back to general case if we have duplicates +# normally it should happen fast function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}) where {T} left_ixs = Int[] right_ixs = Int[] + + # lower bound assuming we get matches + sizehint!(left_ixs, length(right)) + sizehint!(right_ixs, length(right)) + dict = Dict{T, Vector{Int}}() for (idx_r, val_r) in enumerate(right) push!(get!(Vector{Int}, dict, val_r), idx_r) end + n = 0 @inbounds for (idx_l, val_l) in enumerate(left) dict_index = Base.ht_keyindex(dict, val_l) if dict_index > 0 # -1 if key not found @inbounds idxs_r = dict.vals[dict_index] - append!(left_ixs, Iterators.repeated(idx_l, length(idxs_r))) - append!(right_ixs, idxs_r) + l = length(idxs_r) + newn = n + l + resize!(left_ixs, newn) + @simd for i in n+1:n+l + @inbounds left_ixs[i] = idx_l + end + resize!(right_ixs, newn) + @simd for i in 1:l + @inbounds right_ixs[n+i] = idxs_r[i] + end + n = newn end end From 0eb911ec0bea5f418893cc4f5131f53f974c8e88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 27 Jan 2021 12:04:55 +0100 Subject: [PATCH 03/59] fix eltype test --- src/abstractdataframe/join.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 42f055b37d..213da98c97 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -114,7 +114,7 @@ function compose_inner_table(joiner::DataFrameJoiner, # the problem that some entries are not comparable isequal(left_col[1], right_col[1]) isless(left_col[1], right_col[1]) - if isconcretetype(left_col) && isconcretetype(right_col) && + if isconcretetype(eltype(left_col)) && isconcretetype(eltype(right_col)) && issorted(left_col) && issorted(right_col) both_sorted = true end From a16b6f29b9ffa521b5ecd41f710c460f0c90d1c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 28 Jan 2021 15:25:50 +0100 Subject: [PATCH 04/59] use strategy with single index pool in case of duplicates --- src/abstractdataframe/join.jl | 68 +++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 213da98c97..ea2f06e123 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -217,16 +217,17 @@ end # optimistically assume that shorter table does not have duplicates in on column function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where {T} - left_ixs = Int[] - right_ixs = Int[] dict = Dict{T, Int}() for (idx_r, val_r) in enumerate(right) dict_index = Base.ht_keyindex2!(dict, val_r) - dict_index > 0 && return _innerjoin_dup(left, right) + dict_index > 0 && return _innerjoin_dup(left, right, dict, idx_r) Base._setindex!(dict, idx_r, val_r, -dict_index) end + left_ixs = Int[] + right_ixs = Int[] + for (idx_l, val_l) in enumerate(left) dict_index = Base.ht_keyindex(dict, val_l) if dict_index > 0 # -1 if key not found @@ -240,26 +241,65 @@ end # we fall back to general case if we have duplicates # normally it should happen fast -function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}) where {T} +function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}, dict::Dict{T, Int}, idx_r_start::Int) where {T} + ngroups = idx_r_start - 1 + right_len = length(right) + groups = Vector{Int}(undef, right_len) + groups[1:ngroups] = 1:ngroups + + for idx_r in idx_r_start:right_len + @inbounds val_r = right[idx_r] + dict_index = Base.ht_keyindex(dict, val_r) + if dict_index > 0 + @inbounds groups[idx_r] = dict.vals[dict_index] + else + ngroups += 1 + @inbounds groups[idx_r] = ngroups + Base._setindex!(dict, idx_r, val_r, -dict_index) + end + end + + @assert ngroups > 0 # we should not get here with 0-length right + return _innerjoin_postprocess(left, dict, groups, ngroups, right_len) +end + +function compute_join_indices!(groups::Vector{Int}, ngroups::Int, + starts::Vector, rperm::Vector) + @inbounds for gix in groups + starts[gix] += 1 + end + + cumsum!(starts, starts) + + @inbounds for (i, gix) in enumerate(groups) + rperm[starts[gix]] = i + starts[gix] -= 1 + end + push!(starts, length(groups)) + return nothing +end + +function _innerjoin_postprocess(left::AbstractArray, dict::Dict{T, Int}, + groups::Vector{Int}, ngroups::Int, right_len::Int) where {T} + starts = zeros(Int, ngroups) + rperm = Vector{Int}(undef, right_len) + left_ixs = Int[] right_ixs = Int[] # lower bound assuming we get matches - sizehint!(left_ixs, length(right)) - sizehint!(right_ixs, length(right)) - - dict = Dict{T, Vector{Int}}() + sizehint!(left_ixs, right_len) + sizehint!(right_ixs, right_len) - for (idx_r, val_r) in enumerate(right) - push!(get!(Vector{Int}, dict, val_r), idx_r) - end + compute_join_indices!(groups, ngroups, starts, rperm) n = 0 @inbounds for (idx_l, val_l) in enumerate(left) dict_index = Base.ht_keyindex(dict, val_l) if dict_index > 0 # -1 if key not found - @inbounds idxs_r = dict.vals[dict_index] - l = length(idxs_r) + @inbounds group_id = dict.vals[dict_index] + @inbounds ref_stop = starts[group_id + 1] + @inbounds l = ref_stop - starts[group_id] newn = n + l resize!(left_ixs, newn) @simd for i in n+1:n+l @@ -267,7 +307,7 @@ function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}) where {T} end resize!(right_ixs, newn) @simd for i in 1:l - @inbounds right_ixs[n+i] = idxs_r[i] + @inbounds right_ixs[n + i] = rperm[ref_stop - i + 1] end n = newn end From 14652f02781966e3ce42a3f2131ec0faef5c7e1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 29 Jan 2021 14:59:25 +0100 Subject: [PATCH 05/59] add tests for innerjoin --- NEWS.md | 5 +++++ src/abstractdataframe/join.jl | 6 +++--- test/join.jl | 25 +++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 65c4daa400..60beb4565a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -31,6 +31,11 @@ ## Other relevant changes +* `innerjoin` is now much faster and checks if passed data frames are sorted + by the `on` columns and takes into account if shorter data frame that is joined + has unique values in `on` columns. These aspect of input data frames might affect + the order of rows produced in the output + ([#2612](https://github.com/JuliaData/DataFrames.jl/pull/2612)) # DataFrames v0.22 Release Notes diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index ea2f06e123..e34234b674 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -189,7 +189,7 @@ function _innerjoin_sorted(left::AbstractArray, right::AbstractArray) while left_cur <= left_n && right_cur <= right_n if isequal(left_val, right_val) - if left_new - left_cur == right_new - right_cur == 2 + if left_new - left_cur == right_new - right_cur == 1 push!(left_ixs, left_cur) push!(right_ixs, right_cur) else @@ -249,13 +249,13 @@ function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}, dict::Dict for idx_r in idx_r_start:right_len @inbounds val_r = right[idx_r] - dict_index = Base.ht_keyindex(dict, val_r) + dict_index = Base.ht_keyindex2!(dict, val_r) if dict_index > 0 @inbounds groups[idx_r] = dict.vals[dict_index] else ngroups += 1 @inbounds groups[idx_r] = ngroups - Base._setindex!(dict, idx_r, val_r, -dict_index) + Base._setindex!(dict, ngroups, val_r, -dict_index) end end diff --git a/test/join.jl b/test/join.jl index ee7dd57068..620f2207a0 100644 --- a/test/join.jl +++ b/test/join.jl @@ -936,4 +936,29 @@ end innerjoin(df1_view2, df2, on=:a) end +@testset "innerjoin correctness tests" begin + function test_innerjoin(df1, df2) + @assert names(df1) == ["id", "x"] + @assert names(df2) == ["id", "y"] + + dfres = DataFrame(id=[], x=[], y=[]) + for i in axes(df1, 1), j in axes(df2, 1) + if isequal(df1.id[i], df2.id[j]) + push!(dfres, (id=df1.id[i], x=df1.x[i], y=df2.y[j])) + end + end + return sort(dfres) == sort(innerjoin(df1, df2, on=:id)) + end + + for i in 1:20, j in 1:10 + for df1 in [DataFrame(id=rand(1:i+j, i+j), x=1:i+j), DataFrame(id=rand(1:i, i), x=1:i)], + df2 in [DataFrame(id=rand(1:i+j, i+j), y=1:i+j), DataFrame(id=rand(1:i, i), y=1:i)] + for opleft = [identity, sort, x -> unique(x, :id), x -> sort(unique(x, :id))], + opright = [identity, sort, x -> unique(x, :id), x -> sort(unique(x, :id))] + @test test_innerjoin(opleft(df1), opright(df2)) + end + end + end +end + end # module From 6a5b6ca62cdde14ae4b2de42980b6ed7ea7b967c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 30 Jan 2021 00:39:05 +0100 Subject: [PATCH 06/59] fast path for PooledArrays --- src/abstractdataframe/join.jl | 43 ++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index e34234b674..84089e4398 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -95,13 +95,50 @@ prepare_on_col() = throw(ArgumentError("at least one on column required when joi prepare_on_col(c::AbstractVector) = c prepare_on_col(cs::AbstractVector...) = tuple.(cs...) +# in map2refs zero(V) is PooledArray.jl specific +function map2refs(x::AbstractVector{T}, ref::PooledArray{T,V,1}) where {T, V} + refip = ref.invpool + return [get(refip, v, zero(V)) for v in x] +end + +function map2refs(x::PooledArray{1, T}, ref::PooledArray{T,V,1}) where {T, V} + refip = ref.invpool + mapping = [get(refip, v, zero(V)) for v in x.pool] + return @inbounds [mapping[r] for r in x.refs] +end + function compose_inner_table(joiner::DataFrameJoiner, makeunique::Bool, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}) - left_col = prepare_on_col(eachcol(joiner.dfl_on)...) - right_col = prepare_on_col(eachcol(joiner.dfr_on)...) + right_shorter = length(joiner.dfr_on[!, 1]) <= length(joiner.dfl_on[!, 1]) + + left_cols = collect(eachcol(joiner.dfl_on)) + right_cols = collect(eachcol(joiner.dfr_on)) + + if right_shorter + for i in eachindex(left_cols, right_cols) + rc = right_cols[i] + lc = left_cols[i] + if lc isa PooledArray && eltype(lc) == eltype(rc) + right_cols[i] = map2refs(rc, lc) + left_cols[i] = lc.refs + end + end + else + for i in eachindex(left_cols, right_cols) + rc = right_cols[i] + lc = left_cols[i] + if rc isa PooledArray && eltype(lc) == eltype(rc) + left_cols[i] = map2refs(lc, rc) + right_cols[i] = rc.refs + end + end + end + + left_col = prepare_on_col(left_cols...) + right_col = prepare_on_col(right_cols...) if isempty(left_col) || isempty(right_col) # we treat this case separately so we know we have at least one element later @@ -124,7 +161,7 @@ function compose_inner_table(joiner::DataFrameJoiner, if both_sorted left_ixs, right_ixs = _innerjoin_sorted(left_col, right_col) - elseif length(right_col) <= length(left_col) + elseif right_shorter left_ixs, right_ixs = _innerjoin_unsorted(left_col, right_col) else right_ixs, left_ixs = _innerjoin_unsorted(right_col, left_col) From c9385daa3500d4348e716bd2e3b3254aded7846b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 30 Jan 2021 11:46:57 +0100 Subject: [PATCH 07/59] update handling of PooledArrays and CategoricalArrays --- src/abstractdataframe/join.jl | 55 +++++++++++++++++++++++++---------- test/join.jl | 21 ++++++++++++- 2 files changed, 59 insertions(+), 17 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 84089e4398..7d02b667fe 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -96,12 +96,12 @@ prepare_on_col(c::AbstractVector) = c prepare_on_col(cs::AbstractVector...) = tuple.(cs...) # in map2refs zero(V) is PooledArray.jl specific -function map2refs(x::AbstractVector{T}, ref::PooledArray{T,V,1}) where {T, V} +function map2refs(x::AbstractVector, ref::PooledArray{T,V,1}) where {T, V} refip = ref.invpool return [get(refip, v, zero(V)) for v in x] end -function map2refs(x::PooledArray{1, T}, ref::PooledArray{T,V,1}) where {T, V} +function map2refs(x::PooledVector, ref::PooledArray{T,V,1}) where {T, V} refip = ref.invpool mapping = [get(refip, v, zero(V)) for v in x.pool] return @inbounds [mapping[r] for r in x.refs] @@ -117,11 +117,13 @@ function compose_inner_table(joiner::DataFrameJoiner, left_cols = collect(eachcol(joiner.dfl_on)) right_cols = collect(eachcol(joiner.dfr_on)) + disallow_sorted = false + if right_shorter for i in eachindex(left_cols, right_cols) rc = right_cols[i] lc = left_cols[i] - if lc isa PooledArray && eltype(lc) == eltype(rc) + if lc isa PooledArray right_cols[i] = map2refs(rc, lc) left_cols[i] = lc.refs end @@ -130,13 +132,32 @@ function compose_inner_table(joiner::DataFrameJoiner, for i in eachindex(left_cols, right_cols) rc = right_cols[i] lc = left_cols[i] - if rc isa PooledArray && eltype(lc) == eltype(rc) - left_cols[i] = map2refs(lc, rc) + if rc isa PooledArray right_cols[i] = rc.refs + left_cols[i] = map2refs(lc, rc) end + + # this is a workaround for https://github.com/JuliaData/CategoricalArrays.jl/issues/319 + rct = typeof(right_cols[i]) + lct = typeof(left_cols[i]) + rcat = rct.name === :CategoricalArray && nameof(rct.module) === :CategoricalArrays + lcat = lct.name === :CategoricalArray && nameof(lct.module) === :CategoricalArrays + disallow_sorted |= rcat ⊻ lcat end end + # TODO: if PooledArrays are found potentially the following optimizations can be done: + # 1. identify rows in shorter table that should be dropped + # 2. develop custom _innerjoin_sorted and _innerjoin_unsorted that + # drop rows from shorter table that do not match rows from longer table based on + # PooledArray refpool check + # this optimization significantly complicates the code (especially sorted path). + # It should be added if in practice we find that the use case is often enough + # and that the benefits are significant. The two cases when the benefits should + # be expected are: + # 1. Shorter table is sorted when we drop rows not matching longer table rows + # 2. Shorter table does not have duplicates when we drop rows not matching longer table rows + left_col = prepare_on_col(left_cols...) right_col = prepare_on_col(right_cols...) @@ -145,18 +166,20 @@ function compose_inner_table(joiner::DataFrameJoiner, left_ixs, right_ixs = Int[], Int[] else both_sorted = false - try - # the isless, isequal and isconcretetype tests are to make sure - # that if we use the fast path for sorted vectors we do not hit - # the problem that some entries are not comparable - isequal(left_col[1], right_col[1]) - isless(left_col[1], right_col[1]) - if isconcretetype(eltype(left_col)) && isconcretetype(eltype(right_col)) && - issorted(left_col) && issorted(right_col) - both_sorted = true + if disallow_sorted + try + # the isless, isequal and isconcretetype tests are to make sure + # that if we use the fast path for sorted vectors we do not hit + # the problem that some entries are not comparable + isequal(left_col[1], right_col[1]) + isless(left_col[1], right_col[1]) + if isconcretetype(eltype(left_col)) && isconcretetype(eltype(right_col)) && + issorted(left_col) && issorted(right_col) + both_sorted = true + end + catch + # nothing to do - one of the columns is not sortable end - catch - # nothing to do - one of the columns is not sortable end if both_sorted diff --git a/test/join.jl b/test/join.jl index 620f2207a0..8c91a22267 100644 --- a/test/join.jl +++ b/test/join.jl @@ -1,6 +1,6 @@ module TestJoin -using Test, DataFrames, Random, CategoricalArrays +using Test, DataFrames, Random, CategoricalArrays, PooledArrays using DataFrames: similar_missing const ≅ = isequal @@ -956,6 +956,25 @@ end for opleft = [identity, sort, x -> unique(x, :id), x -> sort(unique(x, :id))], opright = [identity, sort, x -> unique(x, :id), x -> sort(unique(x, :id))] @test test_innerjoin(opleft(df1), opright(df2)) + @test test_innerjoin(opleft(df1), opright(rename(df1, :x => :y))) + + df1p = copy(opleft(df1)) + df1p[!, 1] = PooledArray(df1p[!, 1]) + df2p = copy(opleft(df2)) + df2p[!, 1] = PooledArray(df2p[!, 1]) + @test test_innerjoin(df1, df2p) + @test test_innerjoin(df1p, df2) + @test test_innerjoin(df1p, df2p) + @test test_innerjoin(df1p, rename(df1p, :x => :y)) + + df1c = copy(opleft(df1)) + df1c[!, 1] = categorical(df1c[!, 1]) + df2c = copy(opleft(df2)) + df2c[!, 1] = categorical(df2c[!, 1]) + @test test_innerjoin(df1, df2c) + @test test_innerjoin(df1c, df2c) + @test test_innerjoin(df1c, df2) + @test test_innerjoin(df1c, rename(df1c, :x => :y)) end end end From b8907aee8ffae2fece829e69cebcd1f9a74de930 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 31 Jan 2021 22:18:43 +0100 Subject: [PATCH 08/59] add more tests --- test/join.jl | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/test/join.jl b/test/join.jl index 8c91a22267..49500204ae 100644 --- a/test/join.jl +++ b/test/join.jl @@ -936,7 +936,7 @@ end innerjoin(df1_view2, df2, on=:a) end -@testset "innerjoin correctness tests" begin +@time @testset "innerjoin correctness tests" begin function test_innerjoin(df1, df2) @assert names(df1) == ["id", "x"] @assert names(df2) == ["id", "y"] @@ -947,12 +947,28 @@ end push!(dfres, (id=df1.id[i], x=df1.x[i], y=df2.y[j])) end end - return sort(dfres) == sort(innerjoin(df1, df2, on=:id)) + + df1x = copy(df1) + df1x.id2 = copy(df1x.id) + df2x = copy(df2) + df2x.id2 = copy(df2x.id) + + sort!(dfres) + dfres2 = copy(dfres) + insertcols!(dfres2, 3, :id2 => dfres2.id) + + return dfres ≅ sort(innerjoin(df1, df2, on=:id, matchmissing=:equal)) && + dfres2 ≅ sort(innerjoin(df1x, df2x, on=[:id, :id2], matchmissing=:equal)) end - for i in 1:20, j in 1:10 - for df1 in [DataFrame(id=rand(1:i+j, i+j), x=1:i+j), DataFrame(id=rand(1:i, i), x=1:i)], - df2 in [DataFrame(id=rand(1:i+j, i+j), y=1:i+j), DataFrame(id=rand(1:i, i), y=1:i)] + Random.seed!(1234) + for i in 1:10, j in 0:3 + for df1 in [DataFrame(id=rand(1:i+j, i+j), x=1:i+j), DataFrame(id=rand(1:i, i), x=1:i), + DataFrame(id=[rand(1:i+j, i+j); missing], x=1:i+j+1), + DataFrame(id=[rand(1:i, i); missing], x=1:i+1)], + df2 in [DataFrame(id=rand(1:i+j, i+j), y=1:i+j), DataFrame(id=rand(1:i, i), y=1:i), + DataFrame(id=[rand(1:i+j, i+j); missing], y=1:i+j+1), + DataFrame(id=[rand(1:i, i); missing], y=1:i+1)] for opleft = [identity, sort, x -> unique(x, :id), x -> sort(unique(x, :id))], opright = [identity, sort, x -> unique(x, :id), x -> sort(unique(x, :id))] @test test_innerjoin(opleft(df1), opright(df2)) @@ -978,6 +994,12 @@ end end end end + + @test innerjoin(DataFrame(id=[]), DataFrame(id=[]), on=:id) == DataFrame(id=[]) + @test innerjoin(DataFrame(id=[]), DataFrame(id=[1, 2, 3]), on=:id) == DataFrame(id=[]) + @test innerjoin(DataFrame(id=[1, 2, 3]), DataFrame(id=[]), on=:id) == DataFrame(id=[]) + @test innerjoin(DataFrame(id=[4, 5, 6]), DataFrame(id=[1, 2, 3]), on=:id) == DataFrame(id=[]) + @test innerjoin(DataFrame(id=[1, 2, 3]), DataFrame(id=[4, 5, 6]), on=:id) == DataFrame(id=[]) end end # module From c4d2c4650e12cebd534293693c60cbacc3c51f0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 2 Feb 2021 18:28:25 +0100 Subject: [PATCH 09/59] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/join.jl | 11 ++++------- test/join.jl | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 7d02b667fe..87b27952d3 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -216,12 +216,9 @@ end n = length(x) stop = start + 1 while stop <= n - stop_value = x[stop] - if isequal(start_value, stop_value) - stop += 1 - else - return stop, stop_value - end + @inbounds stop_value = x[stop] + isequal(start_value, stop_value) || break + stop += 1 end return stop, start_value end @@ -357,7 +354,7 @@ function _innerjoin_postprocess(left::AbstractArray, dict::Dict{T, Int}, @inbounds for (idx_l, val_l) in enumerate(left) dict_index = Base.ht_keyindex(dict, val_l) if dict_index > 0 # -1 if key not found - @inbounds group_id = dict.vals[dict_index] + group_id = dict.vals[dict_index] @inbounds ref_stop = starts[group_id + 1] @inbounds l = ref_stop - starts[group_id] newn = n + l diff --git a/test/join.jl b/test/join.jl index 49500204ae..d0f9222741 100644 --- a/test/join.jl +++ b/test/join.jl @@ -936,7 +936,7 @@ end innerjoin(df1_view2, df2, on=:a) end -@time @testset "innerjoin correctness tests" begin +@testset "innerjoin correctness tests" begin function test_innerjoin(df1, df2) @assert names(df1) == ["id", "x"] @assert names(df2) == ["id", "y"] From 3f8c49f84b14974df1ec0506523f3495bb4ed37e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 2 Feb 2021 18:29:05 +0100 Subject: [PATCH 10/59] Update src/abstractdataframe/join.jl --- src/abstractdataframe/join.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 87b27952d3..cccf760bd2 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -104,7 +104,7 @@ end function map2refs(x::PooledVector, ref::PooledArray{T,V,1}) where {T, V} refip = ref.invpool mapping = [get(refip, v, zero(V)) for v in x.pool] - return @inbounds [mapping[r] for r in x.refs] + return [@inbounds mapping[r] for r in x.refs] end function compose_inner_table(joiner::DataFrameJoiner, From e306de11ec564f504ad6b13e318c41b4f6bdb88c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 2 Feb 2021 18:43:50 +0100 Subject: [PATCH 11/59] add more comments and optimistically try sorted join algorithm --- src/abstractdataframe/join.jl | 53 +++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index cccf760bd2..cc15f7398d 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -161,33 +161,35 @@ function compose_inner_table(joiner::DataFrameJoiner, left_col = prepare_on_col(left_cols...) right_col = prepare_on_col(right_cols...) + local left_ixs + local right_ixs + local already_joined + if isempty(left_col) || isempty(right_col) # we treat this case separately so we know we have at least one element later left_ixs, right_ixs = Int[], Int[] else - both_sorted = false - if disallow_sorted + already_joined = false + # if sorting is not disallowed try using a fast algorithm that works + # on sorted columns; if it is not run or errors fall back to the unsorted case + # the try-catch is used to handle the case when columns on which we join + # contain values that are not comparable + if !disallow_sorted try - # the isless, isequal and isconcretetype tests are to make sure - # that if we use the fast path for sorted vectors we do not hit - # the problem that some entries are not comparable - isequal(left_col[1], right_col[1]) - isless(left_col[1], right_col[1]) - if isconcretetype(eltype(left_col)) && isconcretetype(eltype(right_col)) && - issorted(left_col) && issorted(right_col) - both_sorted = true + if issorted(left_col) && issorted(right_col) + left_ixs, right_ixs = _innerjoin_sorted(left_col, right_col) + already_joined = true end catch # nothing to do - one of the columns is not sortable end end - - if both_sorted - left_ixs, right_ixs = _innerjoin_sorted(left_col, right_col) - elseif right_shorter - left_ixs, right_ixs = _innerjoin_unsorted(left_col, right_col) - else - right_ixs, left_ixs = _innerjoin_unsorted(right_col, left_col) + if !already_joined + if right_shorter + left_ixs, right_ixs = _innerjoin_unsorted(left_col, right_col) + else + right_ixs, left_ixs = _innerjoin_unsorted(right_col, left_col) + end end end @@ -273,10 +275,16 @@ function _innerjoin_sorted(left::AbstractArray, right::AbstractArray) end # optimistically assume that shorter table does not have duplicates in on column +# if this is not the case we call _innerjoin_dup +# which efficiently uses the work already done and continues with the more +# memory expensive algorithm that allows for duplicates function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where {T} dict = Dict{T, Int}() for (idx_r, val_r) in enumerate(right) + # we use dict_index to make sure the following two operations are fast: + # - if index is found - fall back to algorithm allowing duplicates + # - if index is not found - add it dict_index = Base.ht_keyindex2!(dict, val_r) dict_index > 0 && return _innerjoin_dup(left, right, dict, idx_r) Base._setindex!(dict, idx_r, val_r, -dict_index) @@ -286,6 +294,9 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where right_ixs = Int[] for (idx_l, val_l) in enumerate(left) + # we use dict_index to make sure the following two operations are fast: + # - if index is found - get it and process it + # - if index is not found - do nothing dict_index = Base.ht_keyindex(dict, val_l) if dict_index > 0 # -1 if key not found @inbounds idx_r = dict.vals[dict_index] @@ -297,7 +308,7 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where end # we fall back to general case if we have duplicates -# normally it should happen fast +# normally it should happen fast as we reuse work already done function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}, dict::Dict{T, Int}, idx_r_start::Int) where {T} ngroups = idx_r_start - 1 right_len = length(right) @@ -306,6 +317,9 @@ function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}, dict::Dict for idx_r in idx_r_start:right_len @inbounds val_r = right[idx_r] + # we use dict_index to make sure the following two operations are fast: + # - if index is found - process the row with existing group number + # - if index is not found - add a new group dict_index = Base.ht_keyindex2!(dict, val_r) if dict_index > 0 @inbounds groups[idx_r] = dict.vals[dict_index] @@ -352,6 +366,9 @@ function _innerjoin_postprocess(left::AbstractArray, dict::Dict{T, Int}, n = 0 @inbounds for (idx_l, val_l) in enumerate(left) + # we use dict_index to make sure the following two operations are fast: + # - if index is found - get it and process it + # - if index is not found - do nothing dict_index = Base.ht_keyindex(dict, val_l) if dict_index > 0 # -1 if key not found group_id = dict.vals[dict_index] From 47fe234b5fb673c05848818ce05421e9c433fda7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 2 Feb 2021 21:23:09 +0100 Subject: [PATCH 12/59] fix lookup --- src/abstractdataframe/join.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index cc15f7398d..01d492e8a2 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -214,7 +214,7 @@ function compose_inner_table(joiner::DataFrameJoiner, end @inline function find_next_range(x::AbstractArray, start::Int, start_value) - local stop_value + stop_value = start_value n = length(x) stop = start + 1 while stop <= n @@ -222,7 +222,7 @@ end isequal(start_value, stop_value) || break stop += 1 end - return stop, start_value + return stop, stop_value end function _innerjoin_sorted(left::AbstractArray, right::AbstractArray) From c24f6787e0f3fffe51b4e827c0a342108e960847 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 3 Feb 2021 00:26:49 +0100 Subject: [PATCH 13/59] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/join.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 01d492e8a2..b114cae7f1 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -372,16 +372,16 @@ function _innerjoin_postprocess(left::AbstractArray, dict::Dict{T, Int}, dict_index = Base.ht_keyindex(dict, val_l) if dict_index > 0 # -1 if key not found group_id = dict.vals[dict_index] - @inbounds ref_stop = starts[group_id + 1] - @inbounds l = ref_stop - starts[group_id] + ref_stop = starts[group_id + 1] + l = ref_stop - starts[group_id] newn = n + l resize!(left_ixs, newn) - @simd for i in n+1:n+l - @inbounds left_ixs[i] = idx_l + for i in n+1:n+l + left_ixs[i] = idx_l end resize!(right_ixs, newn) - @simd for i in 1:l - @inbounds right_ixs[n + i] = rperm[ref_stop - i + 1] + for i in 1:l + right_ixs[n + i] = rperm[ref_stop - i + 1] end n = newn end From 928f37281d36a11c63250e02eed2f97f344e288e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 3 Feb 2021 11:25:35 +0100 Subject: [PATCH 14/59] use DataAPI.invrefpool --- Project.toml | 2 +- src/abstractdataframe/join.jl | 91 +++++++++++++++++++++++++---------- 2 files changed, 67 insertions(+), 26 deletions(-) diff --git a/Project.toml b/Project.toml index 56590cb137..a7e329ae30 100644 --- a/Project.toml +++ b/Project.toml @@ -30,7 +30,7 @@ DataAPI = "1.4" InvertedIndices = "1" IteratorInterfaceExtensions = "0.1.1, 1" Missings = "0.4.2" -PooledArrays = "0.5" +PooledArrays = "1.1" PrettyTables = "0.11" Reexport = "0.1, 0.2, 1.0" SortingAlgorithms = "0.1, 0.2, 0.3" diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 01d492e8a2..44d5ccd0a4 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -95,16 +95,43 @@ prepare_on_col() = throw(ArgumentError("at least one on column required when joi prepare_on_col(c::AbstractVector) = c prepare_on_col(cs::AbstractVector...) = tuple.(cs...) -# in map2refs zero(V) is PooledArray.jl specific -function map2refs(x::AbstractVector, ref::PooledArray{T,V,1}) where {T, V} - refip = ref.invpool - return [get(refip, v, zero(V)) for v in x] +# Return if it is allowed to use refpool instead of the original array for joining. +# There are multiple conditions that must be met to allow for this. +# In particular we must be able to find a sentinel value that will be used +# in mapping to signal that the mapping to the longer column is not present +# in some rare cases it is impossible to find such a sentinel (e.g. for +# CategoricalArray with missing and having number of levels exactly equal +# to the typemax of element type of ref pool) +function check_mapping_allowed(short, long) + isempty(short) && return false, nothing + isnothing(DataAPI.refpool(long)) && return false, nothing + isnothing(DataAPI.invrefpool(long)) && return false, nothing + + T = typeof(DataAPI.refarray(long)) + T isa Union{Signed, Unsigned} || return false, nothing + sentinel = zero(T) + haskey(DataAPI.invrefpool(long), sentinel) || return true, sentinel + try + sentinel = typemin(T) + haskey(DataAPI.invrefpool(long), sentinel) || return true, sentinel + sentinel = typemax(T) + haskey(DataAPI.invrefpool(long), sentinel) || return true, sentinel + catch + # nothing to do - we could not find an appropriate sentinel + end + return false, nothing end -function map2refs(x::PooledVector, ref::PooledArray{T,V,1}) where {T, V} - refip = ref.invpool - mapping = [get(refip, v, zero(V)) for v in x.pool] - return [@inbounds mapping[r] for r in x.refs] +# in map2refs zero(V) is PooledArray.jl specific +function map2refs(x::AbstractVector, invrefpool, sentinel) + x_refpool = DataAPI.refpool(x) + + if isnothing(x_refpool) + return [get(invrefpool, v, sentinel) for v in x] + else + mapping = [get(invrefpool, v, sentinel) for v in x.pool] + return [@inbounds mapping[r] for r in x.refs] + end end function compose_inner_table(joiner::DataFrameJoiner, @@ -117,41 +144,55 @@ function compose_inner_table(joiner::DataFrameJoiner, left_cols = collect(eachcol(joiner.dfl_on)) right_cols = collect(eachcol(joiner.dfr_on)) - disallow_sorted = false - + # if column of a longer table supports DataAPI.refpool and DataAPI.invrefpool + # remap matching left and right columns to use refs if right_shorter for i in eachindex(left_cols, right_cols) rc = right_cols[i] lc = left_cols[i] - if lc isa PooledArray - right_cols[i] = map2refs(rc, lc) - left_cols[i] = lc.refs + mappingallowed, sentinel = check_mapping_allowed(rc, lc) + lc_refpool = DataAPI.refpool(lc) + lc_invrefpool = DataAPI.invrefpool(lc) + if mappingallowed + right_cols[i] = map2refs(rc, lc_invrefpool, sentinel) + left_cols[i] = lc_refpool end end else for i in eachindex(left_cols, right_cols) rc = right_cols[i] lc = left_cols[i] - if rc isa PooledArray - right_cols[i] = rc.refs - left_cols[i] = map2refs(lc, rc) + mappingallowed, sentinel = check_mapping_allowed(lc, rc) + rc_refpool = DataAPI.refpool(rc) + rc_invrefpool = DataAPI.invrefpool(rc) + if mappingallowed + right_cols[i] = rc_refpool + left_cols[i] = map2refs(lc, rc_invrefpool, sentinel) end - - # this is a workaround for https://github.com/JuliaData/CategoricalArrays.jl/issues/319 - rct = typeof(right_cols[i]) - lct = typeof(left_cols[i]) - rcat = rct.name === :CategoricalArray && nameof(rct.module) === :CategoricalArrays - lcat = lct.name === :CategoricalArray && nameof(lct.module) === :CategoricalArrays - disallow_sorted |= rcat ⊻ lcat end end - # TODO: if PooledArrays are found potentially the following optimizations can be done: + # this is a workaround for https://github.com/JuliaData/CategoricalArrays.jl/issues/319 + # this path will be triggered only in rare cases when the refpool code above + # fails to convert CategoricalArray into refpool + disallow_sorted = false + + for (lc, rc) in zip(left_cols, right_cols) + lct = typeof(lc) + rct = typeof(rc) + rcat = rct.name === :CategoricalArray && nameof(rct.module) === :CategoricalArrays + lcat = lct.name === :CategoricalArray && nameof(lct.module) === :CategoricalArrays + disallow_sorted |= rcat ⊻ lcat + end + + # TODO: + # If DataAPI.invrefpool vectors are found in the "on" columns + # then potentially the following optimizations can be done: # 1. identify rows in shorter table that should be dropped # 2. develop custom _innerjoin_sorted and _innerjoin_unsorted that # drop rows from shorter table that do not match rows from longer table based on # PooledArray refpool check - # this optimization significantly complicates the code (especially sorted path). + # This optimization significantly complicates the code (especially sorted path). # It should be added if in practice we find that the use case is often enough # and that the benefits are significant. The two cases when the benefits should # be expected are: From 9cf62af0fcd22112be13d9de7d3350390bab5e48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 3 Feb 2021 14:53:01 +0100 Subject: [PATCH 15/59] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/join.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index a9726ec87a..65c039ba3d 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -98,16 +98,16 @@ prepare_on_col(cs::AbstractVector...) = tuple.(cs...) # Return if it is allowed to use refpool instead of the original array for joining. # There are multiple conditions that must be met to allow for this. # In particular we must be able to find a sentinel value that will be used -# in mapping to signal that the mapping to the longer column is not present -# in some rare cases it is impossible to find such a sentinel (e.g. for +# in mapping to signal that the mapping to the longer column is not present. +# In some rare cases it is impossible to find such a sentinel (e.g. for # CategoricalArray with missing and having number of levels exactly equal # to the typemax of element type of ref pool) -function check_mapping_allowed(short, long) +function check_mapping_allowed(short::AbstractVector, long::AbstractVector) isempty(short) && return false, nothing isnothing(DataAPI.refpool(long)) && return false, nothing isnothing(DataAPI.invrefpool(long)) && return false, nothing - T = typeof(DataAPI.refarray(long)) + T = eltype(DataAPI.refarray(long)) T isa Union{Signed, Unsigned} || return false, nothing sentinel = zero(T) haskey(DataAPI.invrefpool(long), sentinel) || return true, sentinel @@ -129,7 +129,7 @@ function map2refs(x::AbstractVector, invrefpool, sentinel) if isnothing(x_refpool) return [get(invrefpool, v, sentinel) for v in x] else - mapping = [get(invrefpool, v, sentinel) for v in x.pool] + mapping = [get(invrefpool, v, sentinel) for v in x_refpool] return [@inbounds mapping[r] for r in x.refs] end end From 3912ae6045105a7401a1fb6f25093488b043d2e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 3 Feb 2021 18:57:53 +0100 Subject: [PATCH 16/59] use nothing as sentinel --- src/abstractdataframe/join.jl | 79 ++++++++++++++++------------------- 1 file changed, 36 insertions(+), 43 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 65c039ba3d..9ac2b0ac32 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -81,9 +81,6 @@ end Base.length(x::RowIndexMap) = length(x.orig) -# composes the joined data table using the maps between the left and right -# table rows and the indices of rows in the result - _rename_cols(old_names::AbstractVector{Symbol}, renamecols::Union{Function, Symbol, AbstractString}, exclude::AbstractVector{Symbol} = Symbol[]) = @@ -97,41 +94,31 @@ prepare_on_col(cs::AbstractVector...) = tuple.(cs...) # Return if it is allowed to use refpool instead of the original array for joining. # There are multiple conditions that must be met to allow for this. -# In particular we must be able to find a sentinel value that will be used -# in mapping to signal that the mapping to the longer column is not present. -# In some rare cases it is impossible to find such a sentinel (e.g. for -# CategoricalArray with missing and having number of levels exactly equal -# to the typemax of element type of ref pool) -function check_mapping_allowed(short::AbstractVector, long::AbstractVector) - isempty(short) && return false, nothing - isnothing(DataAPI.refpool(long)) && return false, nothing - isnothing(DataAPI.invrefpool(long)) && return false, nothing - - T = eltype(DataAPI.refarray(long)) - T isa Union{Signed, Unsigned} || return false, nothing - sentinel = zero(T) - haskey(DataAPI.invrefpool(long), sentinel) || return true, sentinel - try - sentinel = typemin(T) - haskey(DataAPI.invrefpool(long), sentinel) || return true, sentinel - sentinel = typemax(T) - haskey(DataAPI.invrefpool(long), sentinel) || return true, sentinel - catch - # nothing to do - we could not find an appropriate sentinel +# If it is allowed we are sure that nothing can be used as a sentinel +function check_mapping_allowed(short::AbstractVector, + refarray_long::AbstractVector, + refpool_long, invrefpool_long) + if isempty(short) || + isnothing(refpool_long) || + isnothing(invrefpool_long) || + eltype(refarray_long) isa Union{Signed, Unsigned} + return false + else + return true end - return false, nothing end -# in map2refs zero(V) is PooledArray.jl specific -function map2refs(x::AbstractVector, invrefpool, sentinel) - x_refpool = DataAPI.refpool(x) +@noinline map_refarray(mapping::AbstractVector, refarray::AbstractVector) = + [@inbounds mapping[r] for r in refarray] + +map2refs(x::AbstractVector, invrefpool) = [get(invrefpool, v, nothing) for v in x] + +# this is PooledArrays.jl specific optimization as its pool is a 1-based vector +function map2refs(x::PooledVector, invrefpool) + mapping = [get(invrefpool, v, nothing) for v in x.pool] + # use function barrier as mapping is type unstable + return map_refarray(mapping, DataAPI.refarray(x)) - if isnothing(x_refpool) - return [get(invrefpool, v, sentinel) for v in x] - else - mapping = [get(invrefpool, v, sentinel) for v in x_refpool] - return [@inbounds mapping[r] for r in x.refs] - end end function compose_inner_table(joiner::DataFrameJoiner, @@ -139,7 +126,9 @@ function compose_inner_table(joiner::DataFrameJoiner, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}) - right_shorter = length(joiner.dfr_on[!, 1]) <= length(joiner.dfl_on[!, 1]) + right_len = length(joiner.dfr_on[!, 1]) + left_len = length(joiner.dfl_on[!, 1]) + right_shorter = right_len <= left_len left_cols = collect(eachcol(joiner.dfl_on)) right_cols = collect(eachcol(joiner.dfr_on)) @@ -150,24 +139,26 @@ function compose_inner_table(joiner::DataFrameJoiner, for i in eachindex(left_cols, right_cols) rc = right_cols[i] lc = left_cols[i] - mappingallowed, sentinel = check_mapping_allowed(rc, lc) + + lc_refs = DataAPI.refarray(lc) lc_refpool = DataAPI.refpool(lc) lc_invrefpool = DataAPI.invrefpool(lc) - if mappingallowed - right_cols[i] = map2refs(rc, lc_invrefpool, sentinel) - left_cols[i] = lc_refpool + if check_mapping_allowed(rc, lc_refs, lc_refpool, lc_invrefpool) + right_cols[i] = map2refs(rc, lc_invrefpool) + left_cols[i] = lc_refs end end else for i in eachindex(left_cols, right_cols) rc = right_cols[i] lc = left_cols[i] - mappingallowed, sentinel = check_mapping_allowed(lc, rc) + + rc_refs = DataAPI.refarray(rc) rc_refpool = DataAPI.refpool(rc) rc_invrefpool = DataAPI.invrefpool(rc) - if mappingallowed - right_cols[i] = rc_refpool - left_cols[i] = map2refs(lc, rc_invrefpool, sentinel) + if check_mapping_allowed(lc, rc_refs, rc_refpool, rc_invrefpool) + right_cols[i] = rc_refs + left_cols[i] = map2refs(lc, rc_invrefpool) end end end @@ -178,6 +169,8 @@ function compose_inner_table(joiner::DataFrameJoiner, disallow_sorted = false for (lc, rc) in zip(left_cols, right_cols) + @assert length(lc) == left_len + @assert length(rc) == right_len lct = typeof(lc) rct = typeof(rc) rcat = rct.name === :CategoricalArray && nameof(rct.module) === :CategoricalArrays From 68a8eaaa0987026240eb068feb310c1618697321 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 3 Feb 2021 21:03:25 +0100 Subject: [PATCH 17/59] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/join.jl | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 9ac2b0ac32..031b307f28 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -95,18 +95,10 @@ prepare_on_col(cs::AbstractVector...) = tuple.(cs...) # Return if it is allowed to use refpool instead of the original array for joining. # There are multiple conditions that must be met to allow for this. # If it is allowed we are sure that nothing can be used as a sentinel -function check_mapping_allowed(short::AbstractVector, - refarray_long::AbstractVector, - refpool_long, invrefpool_long) - if isempty(short) || - isnothing(refpool_long) || - isnothing(invrefpool_long) || - eltype(refarray_long) isa Union{Signed, Unsigned} - return false - else - return true - end -end +check_mapping_allowed(short::AbstractVector, refarray_long::AbstractVector, + refpool_long, invrefpool_long) = + !isempty(short) && !isnothing(refpool_long) && !isnothing(invrefpool_long) && + eltype(refarray_long) <: Union{Signed, Unsigned} @noinline map_refarray(mapping::AbstractVector, refarray::AbstractVector) = [@inbounds mapping[r] for r in refarray] @@ -118,7 +110,6 @@ function map2refs(x::PooledVector, invrefpool) mapping = [get(invrefpool, v, nothing) for v in x.pool] # use function barrier as mapping is type unstable return map_refarray(mapping, DataAPI.refarray(x)) - end function compose_inner_table(joiner::DataFrameJoiner, From c05410b0bc5d26224d50de684d63dafd9bd598b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 4 Feb 2021 12:29:52 +0100 Subject: [PATCH 18/59] remove PooledArrays.jl specific code --- src/abstractdataframe/join.jl | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 031b307f28..080954de25 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -100,16 +100,33 @@ check_mapping_allowed(short::AbstractVector, refarray_long::AbstractVector, !isempty(short) && !isnothing(refpool_long) && !isnothing(invrefpool_long) && eltype(refarray_long) <: Union{Signed, Unsigned} -@noinline map_refarray(mapping::AbstractVector, refarray::AbstractVector) = +@noinline map_refarray(mapping::AbstractVector, refarray::AbstractVector, ::Val{true}) = + [@inbounds mapping[r + 1] for r in refarray] + +@noinline map_refarray(mapping::AbstractVector, refarray::AbstractVector, ::Val{false}) = [@inbounds mapping[r] for r in refarray] -map2refs(x::AbstractVector, invrefpool) = [get(invrefpool, v, nothing) for v in x] +function map2refs(x::AbstractVector, invrefpool) + x_refpool = DataAPI.refpool(x) + x_refarray = DataAPI.refarray(x) + if isnothing(x_refpool) || !(x_refpool isa AbstractVector) || !(eltype(x_refarray) <: Integer) + return [get(invrefpool, v, nothing) for v in x] + else + # here we know that x_refpool is AbstractVector that allows integer indexing + # and its firstindex must be an integer + fi = firstindex(x_refpool) + # if fi is not 0 or 1 then we fallback to slow path for safety reasons + # all refpool we currently know have firstindex 0 or 1 + # if there is some very strange firstindex we might run into oveflow issues + if 0 <= fi <= 1 + mapping = [get(invrefpool, v, nothing) for v in x_refpool] + # use function barrier as mapping is type unstable + return map_refarray(mapping, x_refarray, Val(fi == 0)) + else + return [get(invrefpool, v, nothing) for v in x] + end -# this is PooledArrays.jl specific optimization as its pool is a 1-based vector -function map2refs(x::PooledVector, invrefpool) - mapping = [get(invrefpool, v, nothing) for v in x.pool] - # use function barrier as mapping is type unstable - return map_refarray(mapping, DataAPI.refarray(x)) + end end function compose_inner_table(joiner::DataFrameJoiner, From a8b2702dfa4770082fb00292cbeff0376aef0679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 4 Feb 2021 19:36:35 +0100 Subject: [PATCH 19/59] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/join.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 080954de25..ab54020715 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -109,7 +109,7 @@ check_mapping_allowed(short::AbstractVector, refarray_long::AbstractVector, function map2refs(x::AbstractVector, invrefpool) x_refpool = DataAPI.refpool(x) x_refarray = DataAPI.refarray(x) - if isnothing(x_refpool) || !(x_refpool isa AbstractVector) || !(eltype(x_refarray) <: Integer) + if x_refpool isa AbstractVector{<:Integer} && 0 <= firstindex(x_refpool) <= 1 return [get(invrefpool, v, nothing) for v in x] else # here we know that x_refpool is AbstractVector that allows integer indexing @@ -117,7 +117,7 @@ function map2refs(x::AbstractVector, invrefpool) fi = firstindex(x_refpool) # if fi is not 0 or 1 then we fallback to slow path for safety reasons # all refpool we currently know have firstindex 0 or 1 - # if there is some very strange firstindex we might run into oveflow issues + # if there is some very strange firstindex we might run into overflow issues if 0 <= fi <= 1 mapping = [get(invrefpool, v, nothing) for v in x_refpool] # use function barrier as mapping is type unstable From 5ec767fbf0db1bc8402c97fe7a00e4cf52149ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 4 Feb 2021 20:03:48 +0100 Subject: [PATCH 20/59] corrections after the review --- src/abstractdataframe/join.jl | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index ab54020715..3d616db704 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -108,24 +108,17 @@ check_mapping_allowed(short::AbstractVector, refarray_long::AbstractVector, function map2refs(x::AbstractVector, invrefpool) x_refpool = DataAPI.refpool(x) - x_refarray = DataAPI.refarray(x) if x_refpool isa AbstractVector{<:Integer} && 0 <= firstindex(x_refpool) <= 1 - return [get(invrefpool, v, nothing) for v in x] - else # here we know that x_refpool is AbstractVector that allows integer indexing # and its firstindex must be an integer - fi = firstindex(x_refpool) - # if fi is not 0 or 1 then we fallback to slow path for safety reasons + # if firstindex is not 0 or 1 then we fallback to slow path for safety reasons # all refpool we currently know have firstindex 0 or 1 # if there is some very strange firstindex we might run into overflow issues - if 0 <= fi <= 1 - mapping = [get(invrefpool, v, nothing) for v in x_refpool] - # use function barrier as mapping is type unstable - return map_refarray(mapping, x_refarray, Val(fi == 0)) - else - return [get(invrefpool, v, nothing) for v in x] - end - + # below use function barrier as mapping is not type stable + mapping = [get(invrefpool, v, nothing) for v in x_refpool] + return map_refarray(mapping, DataAPI.refarray(x), Val(firstindex(x_refpool) == 0)) + else + return [get(invrefpool, v, nothing) for v in x] end end From 1db13ef1be4d02d679403315835801cc165c05e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 4 Feb 2021 22:14:22 +0100 Subject: [PATCH 21/59] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/join.jl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 3d616db704..694480a776 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -100,11 +100,8 @@ check_mapping_allowed(short::AbstractVector, refarray_long::AbstractVector, !isempty(short) && !isnothing(refpool_long) && !isnothing(invrefpool_long) && eltype(refarray_long) <: Union{Signed, Unsigned} -@noinline map_refarray(mapping::AbstractVector, refarray::AbstractVector, ::Val{true}) = - [@inbounds mapping[r + 1] for r in refarray] - -@noinline map_refarray(mapping::AbstractVector, refarray::AbstractVector, ::Val{false}) = - [@inbounds mapping[r] for r in refarray] +@noinline map_refarray(mapping::AbstractVector, refarray::AbstractVector, ::Val{fi}) where {fi} = + [@inbounds mapping[r - fi + 1] for r in refarray] function map2refs(x::AbstractVector, invrefpool) x_refpool = DataAPI.refpool(x) @@ -116,7 +113,7 @@ function map2refs(x::AbstractVector, invrefpool) # if there is some very strange firstindex we might run into overflow issues # below use function barrier as mapping is not type stable mapping = [get(invrefpool, v, nothing) for v in x_refpool] - return map_refarray(mapping, DataAPI.refarray(x), Val(firstindex(x_refpool) == 0)) + return map_refarray(mapping, DataAPI.refarray(x), Val(Int(firstindex(x_refpool)))) else return [get(invrefpool, v, nothing) for v in x] end From 7f2d89745abdb271b3cfff06a594d060564d0754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 5 Feb 2021 14:21:22 +0100 Subject: [PATCH 22/59] add OnCol --- src/abstractdataframe/join.jl | 142 +++++++++++++++++++++++++++++++++- test/join.jl | 67 +++++++++++++++- 2 files changed, 206 insertions(+), 3 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 694480a776..b4082dcef3 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -88,9 +88,149 @@ _rename_cols(old_names::AbstractVector{Symbol}, (renamecols isa Function ? Symbol(renamecols(string(n))) : Symbol(n, renamecols)) for n in old_names] +struct OnColRow{T} + row::Int + cols::T + + OnColRow(row::Union{Signed,Unsigned}, + cols::NTuple{N, AbstractVector}) where {N} = + new{typeof(cols)}(Int(row), cols) +end + +struct OnCol{T,N} <: AbstractVector{OnColRow{T}} + len::Int + cols::T + + function OnCol(cs::AbstractVector...) + @assert length(cs) > 1 + len = length(cs[1]) + @assert all(x -> firstindex(x) == 1, cs) + @assert all(x -> lastindex(x) == len, cs) + new{typeof(cs), length(cs)}(len, cs) + end +end + +Base.IndexStyle(::Type{<:OnCol}) = Base.IndexLinear() + +@inline Base.size(oc::OnCol) = (oc.len,) + +@inline function Base.getindex(oc::OnCol, i::Int) + @boundscheck checkbounds(oc, i) + return OnColRow(i, oc.cols) +end + +@inline function Base.hash(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, h::UInt) + r1 = ocr1.row + c11, c12 = ocr1.cols + return @inbounds hash(c11[r1], hash((c12[r1],), h)) +end + +@inline function Base.hash(ocr1::OnColRow{<:NTuple{3,AbstractVector}}, h::UInt) + r1 = ocr1.row + c11, c12, c13 = ocr1.cols + return @inbounds hash(c11[r1], hash(c12[r1], hash((c13[r1],), h))) +end + +@inline function Base.hash(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, h::UInt) where {N} + r1 = ocr1.row + cols1 = ocr1.cols + @inbounds hv = hash((cols1[end][r1],), h) + for i in N-1:-1:1 + hv = @inbounds hash(cols1[i][r1], hv) + end + return return hv +end + +Base.:(==)(x::OnColRow, y::OnColRow) = MethodError(==, (x, y)) + +@inline function Base.isequal(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, ocr2::OnColRow{<:NTuple{2, AbstractVector}}) + r1 = ocr1.row + c11, c12 = ocr1.cols + r2 = ocr2.row + c21, c22 = ocr2.cols + + return @inbounds isequal(c11[r1], c21[r2]) && isequal(c12[r1], c22[r2]) +end + +@inline function Base.isequal(ocr1::OnColRow{<:NTuple{3,AbstractVector}}, ocr2::OnColRow{<:NTuple{3,AbstractVector}}) + r1 = ocr1.row + c11, c12, c13 = ocr1.cols + r2 = ocr2.row + c21, c22, c23 = ocr2.cols + + return @inbounds isequal(c11[r1], c21[r2]) && + isequal(c12[r1], c22[r2]) && isequal(c13[r1], c23[r2]) +end + +@inline function Base.isequal(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, ocr2::OnColRow{<:NTuple{N,AbstractVector}}) where {N} + r1 = ocr1.row + cols1 = ocr1.cols + r2 = ocr2.row + cols2 = ocr2.cols + + @inbounds for i in 1:N + isequal(cols1[i][r1], cols2[i][r2]) || return false + end + return true +end + +@inline function Base.isless(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, ocr2::OnColRow{<:NTuple{2, AbstractVector}}) + r1 = ocr1.row + c11, c12 = ocr1.cols + r2 = ocr2.row + c21, c22 = ocr2.cols + + c11r = @inbounds c11[r1] + c12r = @inbounds c12[r1] + c21r = @inbounds c21[r2] + c22r = @inbounds c22[r2] + + isless(c11r, c21r) && return true + isequal(c11r, c21r) || return false + return isless(c12r, c22r) +end + +@inline function Base.isless(ocr1::OnColRow{<:NTuple{3,AbstractVector}}, ocr2::OnColRow{<:NTuple{3,AbstractVector}}) + r1 = ocr1.row + c11, c12, c13 = ocr1.cols + r2 = ocr2.row + c21, c22, c23 = ocr2.cols + + c11r = @inbounds c11[r1] + c12r = @inbounds c12[r1] + c13r = @inbounds c13[r1] + c21r = @inbounds c21[r2] + c22r = @inbounds c22[r2] + c23r = @inbounds c23[r2] + + isless(c11r, c21r) && return true + isequal(c11r, c21r) || return false + isless(c12r, c22r) && return true + isequal(c12r, c22r) || return false + return isless(c13r, c23r) +end + +@inline function Base.isless(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, ocr2::OnColRow{<:NTuple{N,AbstractVector}}) where {T1, T2, N} + r1 = ocr1.row + cols1 = ocr1.cols + r2 = ocr2.row + cols2 = ocr2.cols + + lastcols1 = @inbounds cols1[1][r1] + lastcols2 = @inbounds cols2[1][r2] + isless(lastcols1, lastcols2) && return true + @inbounds for i in 2:N + isequal(lastcols1, lastcols2) || return false + lastcols1 = cols1[i][r1] + lastcols2 = cols2[i][r2] + isless(lastcols1, lastcols2) && return true + end + return false +end + prepare_on_col() = throw(ArgumentError("at least one on column required when joining")) prepare_on_col(c::AbstractVector) = c -prepare_on_col(cs::AbstractVector...) = tuple.(cs...) +prepare_on_col(cs::AbstractVector...) = OnCol(cs...) # Return if it is allowed to use refpool instead of the original array for joining. # There are multiple conditions that must be met to allow for this. diff --git a/test/join.jl b/test/join.jl index d0f9222741..9bae032835 100644 --- a/test/join.jl +++ b/test/join.jl @@ -1,7 +1,7 @@ module TestJoin using Test, DataFrames, Random, CategoricalArrays, PooledArrays -using DataFrames: similar_missing +using DataFrames: similar_missing, OnCol const ≅ = isequal name = DataFrame(ID = Union{Int, Missing}[1, 2, 3], @@ -936,6 +936,67 @@ end innerjoin(df1_view2, df2, on=:a) end +@testset "OnCol correcntess tests" begin + Random.seed!(1234) + c1 = collect(1:10^2) + c2 = collect(Float64, 1:10^2) + c3 = collect(sort(string.(1:10^2))) + c4 = repeat(1:10, inner=10) + c5 = collect(Float64, repeat(1:50, inner=2)) + c6 = sort(string.(repeat(1:25,inner=4))) + c7 = repeat(20:-1:1, inner=5) + + @test_throws AssertionError OnCol() + @test_throws AssertionError OnCol(c1) + @test_throws AssertionError OnCol(c1, [1]) + @test_throws MethodError OnCol(c1, 1) + + oncols = [OnCol(c1, c2), OnCol(c3, c4), OnCol(c5, c6), OnCol(c1, c2, c3), + OnCol(c2, c3, c4), OnCol(c4, c5, c6), OnCol(c1, c2, c3, c4), + OnCol(c2, c3, c4, c5), OnCol(c3, c4, c5, c6), OnCol(c1, c2, c3, c4, c5), + OnCol(c2, c3, c4, c5, c6), OnCol(c1, c2, c3, c4, c5, c6), + OnCol(c4, c7), OnCol(c4, c5, c7), OnCol(c4, c5, c6, c7)] + tupcols = [tuple.(c1, c2), tuple.(c3, c4), tuple.(c5, c6), tuple.(c1, c2, c3), + tuple.(c2, c3, c4), tuple.(c4, c5, c6), tuple.(c1, c2, c3, c4), + tuple.(c2, c3, c4, c5), tuple.(c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5), + tuple.(c2, c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5, c6), + tuple.(c4, c7), tuple.(c4, c5, c7), tuple.(c4, c5, c6, c7)] + + for (oncol, tupcol) in zip(oncols, tupcols) + @test issorted(oncol) == issorted(tupcol) + end + + for i in eachindex(c1), j in eachindex(oncols, tupcols) + @test hash(oncols[j][i]) == hash(tupcols[j][i]) + @test hash(oncols[j][i], UInt(10)) == hash(tupcols[j][i], UInt(10)) + for k in eachindex(c1) + @test isequal(oncols[j][i], oncols[j][k]) == isequal(tupcols[j][i], tupcols[j][k]) + @test isequal(oncols[j][k], oncols[j][i]) == isequal(tupcols[j][k], tupcols[j][i]) + @test isless(oncols[j][i], oncols[j][k]) == isless(tupcols[j][i], tupcols[j][k]) + @test isless(oncols[j][k], oncols[j][i]) == isless(tupcols[j][k], tupcols[j][i]) + end + end + + foreach(shuffle!, [c1, c2, c3, c4, c5, c6]) + + tupcols = [tuple.(c1, c2), tuple.(c3, c4), tuple.(c5, c6), tuple.(c1, c2, c3), + tuple.(c2, c3, c4), tuple.(c4, c5, c6), tuple.(c1, c2, c3, c4), + tuple.(c2, c3, c4, c5), tuple.(c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5), + tuple.(c2, c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5, c6), + tuple.(c4, c7), tuple.(c4, c5, c7), tuple.(c4, c5, c6, c7)] + + for i in eachindex(c1), j in eachindex(oncols, tupcols) + @test hash(oncols[j][i]) == hash(tupcols[j][i]) + @test hash(oncols[j][i], UInt(10)) == hash(tupcols[j][i], UInt(10)) + for k in eachindex(c1) + @test isequal(oncols[j][i], oncols[j][k]) == isequal(tupcols[j][i], tupcols[j][k]) + @test isequal(oncols[j][k], oncols[j][i]) == isequal(tupcols[j][k], tupcols[j][i]) + @test isless(oncols[j][i], oncols[j][k]) == isless(tupcols[j][i], tupcols[j][k]) + @test isless(oncols[j][k], oncols[j][i]) == isless(tupcols[j][k], tupcols[j][i]) + end + end +end + @testset "innerjoin correctness tests" begin function test_innerjoin(df1, df2) @assert names(df1) == ["id", "x"] @@ -962,7 +1023,7 @@ end end Random.seed!(1234) - for i in 1:10, j in 0:3 + for i in 1:5, j in 0:2 for df1 in [DataFrame(id=rand(1:i+j, i+j), x=1:i+j), DataFrame(id=rand(1:i, i), x=1:i), DataFrame(id=[rand(1:i+j, i+j); missing], x=1:i+j+1), DataFrame(id=[rand(1:i, i); missing], x=1:i+1)], @@ -991,6 +1052,8 @@ end @test test_innerjoin(df1c, df2c) @test test_innerjoin(df1c, df2) @test test_innerjoin(df1c, rename(df1c, :x => :y)) + @test test_innerjoin(df1p, df2c) + @test test_innerjoin(df1c, df2p) end end end From 9208bffd19072ea2f462488d5d4d3e58679325cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 5 Feb 2021 22:03:48 +0100 Subject: [PATCH 23/59] add faster processing of integer columns --- src/abstractdataframe/join.jl | 143 +++++++++++++++++++++++++++++++++- 1 file changed, 139 insertions(+), 4 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index b4082dcef3..d0be407b7a 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -252,10 +252,10 @@ function map2refs(x::AbstractVector, invrefpool) # all refpool we currently know have firstindex 0 or 1 # if there is some very strange firstindex we might run into overflow issues # below use function barrier as mapping is not type stable - mapping = [get(invrefpool, v, nothing) for v in x_refpool] + mapping = [get(invrefpool, v, missing) for v in x_refpool] return map_refarray(mapping, DataAPI.refarray(x), Val(Int(firstindex(x_refpool)))) else - return [get(invrefpool, v, nothing) for v in x] + return [get(invrefpool, v, missing) for v in x] end end @@ -358,9 +358,19 @@ function compose_inner_table(joiner::DataFrameJoiner, end if !already_joined if right_shorter - left_ixs, right_ixs = _innerjoin_unsorted(left_col, right_col) + if left_col isa AbstractVector{<:Union{Integer, Missing}} && + right_col isa AbstractVector{<:Union{Integer, Missing}} + left_ixs, right_ixs = _innerjoin_unsorted_int(left_col, right_col) + else + left_ixs, right_ixs = _innerjoin_unsorted(left_col, right_col) + end else - right_ixs, left_ixs = _innerjoin_unsorted(right_col, left_col) + if left_col isa AbstractVector{<:Union{Integer, Missing}} && + right_col isa AbstractVector{<:Union{Integer, Missing}} + right_ixs, left_ixs = _innerjoin_unsorted_int(right_col, left_col) + else + right_ixs, left_ixs = _innerjoin_unsorted(right_col, left_col) + end end end end @@ -479,6 +489,62 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where return left_ixs, right_ixs end +extrema_missing(x::AbstractVector{Missing}) = 1, 0 + +function extrema_missing(x::AbstractVector{T}) where {T<:Union{Integer, Missing}} + try + return extrema(skipmissing(x)) + catch + S = nonmissingtype(T) + return S(1), S(0) + end +end + +function _innerjoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}}, + right::AbstractVector{<:Union{Integer, Missing}}) + minv, maxv = extrema_missing(right) + + if (maxv - minv) > 128 && (maxv - minv) ÷ 2 > length(right) && + (minv < typemin(Int) + 2 || maxv > typemax(Int) - 3) + return _innerjoin_unsorted(left, right) + end + + offset = 1 - Int(minv) # we are now sure it does not overflow + len = Int(maxv) - Int(minv) + 2 + dict = zeros(Int, len) + + @inbounds for (idx_r, val_r) in enumerate(right) + i = ismissing(val_r) ? length(dict) : Int(val_r) + offset + if dict[i] > 0 + return _innerjoin_dup_int(left, right, dict, idx_r, offset, Int(minv), Int(maxv)) + end + dict[i] = idx_r + end + + left_ixs = Int[] + right_ixs = Int[] + + @inbounds for (idx_l, val_l) in enumerate(left) + # we use dict_index to make sure the following two operations are fast: + # - if index is found - get it and process it + # - if index is not found - do nothing + if ismissing(val_l) + idx_r = dict[end] + if idx_r > 0 + push!(left_ixs, idx_l) + push!(right_ixs, idx_r) + end + elseif (minv <= val_l <= maxv) + idx_r = dict[Int(val_l) + offset] + if idx_r > 0 + push!(left_ixs, idx_l) + push!(right_ixs, idx_r) + end + end + end + return left_ixs, right_ixs +end + # we fall back to general case if we have duplicates # normally it should happen fast as we reuse work already done function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}, dict::Dict{T, Int}, idx_r_start::Int) where {T} @@ -506,6 +572,31 @@ function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}, dict::Dict return _innerjoin_postprocess(left, dict, groups, ngroups, right_len) end +function _innerjoin_dup_int(left::AbstractVector{<:Union{Integer, Missing}}, + right::AbstractVector{<:Union{Integer, Missing}}, + dict::Vector{Int}, idx_r_start::Int, offset::Int, minv::Int, maxv::Int) + ngroups = idx_r_start - 1 + right_len = length(right) + groups = Vector{Int}(undef, right_len) + groups[1:ngroups] = 1:ngroups + + @inbounds for idx_r in idx_r_start:right_len + val_r = right[idx_r] + i = ismissing(val_r) ? length(dict) : Int(val_r) + offset + dict_val = dict[i] + if dict_val > 0 + groups[idx_r] = dict_val + else + ngroups += 1 + groups[idx_r] = ngroups + dict[i] = ngroups + end + end + + @assert ngroups > 0 # we should not get here with 0-length right + return _innerjoin_postprocess_int(left, dict, groups, ngroups, right_len, offset, minv, maxv) +end + function compute_join_indices!(groups::Vector{Int}, ngroups::Int, starts::Vector, rperm::Vector) @inbounds for gix in groups @@ -562,6 +653,50 @@ function _innerjoin_postprocess(left::AbstractArray, dict::Dict{T, Int}, return left_ixs, right_ixs end +function _innerjoin_postprocess_int(left::AbstractVector{<:Union{Integer, Missing}}, + dict::Vector{Int}, + groups::Vector{Int}, ngroups::Int, right_len::Int, + offset::Int, minv::Int, maxv::Int) + starts = zeros(Int, ngroups) + rperm = Vector{Int}(undef, right_len) + + left_ixs = Int[] + right_ixs = Int[] + + sizehint!(left_ixs, right_len) + sizehint!(right_ixs, right_len) + + compute_join_indices!(groups, ngroups, starts, rperm) + + n = 0 + @inbounds for (idx_l, val_l) in enumerate(left) + if ismissing(val_l) + group_id = dict[end] + elseif (minv <= val_l <= maxv) + group_id = dict[Int(val_l) + offset] + else + group_id = 0 + end + + if group_id > 0 + ref_stop = starts[group_id + 1] + l = ref_stop - starts[group_id] + newn = n + l + resize!(left_ixs, newn) + for i in n+1:n+l + left_ixs[i] = idx_l + end + resize!(right_ixs, newn) + for i in 1:l + right_ixs[n + i] = rperm[ref_stop - i + 1] + end + n = newn + end + end + + return left_ixs, right_ixs +end + function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap, right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap, From 1735712eb02206b74a23b67d37b381b12c0e2198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 00:58:11 +0100 Subject: [PATCH 24/59] Apply suggestions from code review --- src/abstractdataframe/join.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index d0be407b7a..a16fd37d88 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -234,7 +234,7 @@ prepare_on_col(cs::AbstractVector...) = OnCol(cs...) # Return if it is allowed to use refpool instead of the original array for joining. # There are multiple conditions that must be met to allow for this. -# If it is allowed we are sure that nothing can be used as a sentinel +# If it is allowed we are sure that missing can be used as a sentinel check_mapping_allowed(short::AbstractVector, refarray_long::AbstractVector, refpool_long, invrefpool_long) = !isempty(short) && !isnothing(refpool_long) && !isnothing(invrefpool_long) && @@ -504,8 +504,9 @@ function _innerjoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}} right::AbstractVector{<:Union{Integer, Missing}}) minv, maxv = extrema_missing(right) - if (maxv - minv) > 128 && (maxv - minv) ÷ 2 > length(right) && - (minv < typemin(Int) + 2 || maxv > typemax(Int) - 3) + val_range = big(maxv) - big(minv) + if (val_range > 128 && val_range ÷ 2 > length(right)) || + minv < typemin(Int) + 2 || maxv > typemax(Int) - 3 return _innerjoin_unsorted(left, right) end From bb7e8f1e6ac995dac2dd6948142bf3f5a32f91e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 01:14:05 +0100 Subject: [PATCH 25/59] minor changes --- src/abstractdataframe/join.jl | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index a16fd37d88..c30f7a6da6 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -119,6 +119,9 @@ Base.IndexStyle(::Type{<:OnCol}) = Base.IndexLinear() return OnColRow(i, oc.cols) end +# TODO: rewrite hash, isequal and isless to use @generated +# or some other approach that would keep them efficient and avoid code duplication + @inline function Base.hash(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, h::UInt) r1 = ocr1.row c11, c12 = ocr1.cols @@ -143,7 +146,8 @@ end Base.:(==)(x::OnColRow, y::OnColRow) = MethodError(==, (x, y)) -@inline function Base.isequal(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, ocr2::OnColRow{<:NTuple{2, AbstractVector}}) +@inline function Base.isequal(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, + ocr2::OnColRow{<:NTuple{2, AbstractVector}}) r1 = ocr1.row c11, c12 = ocr1.cols r2 = ocr2.row @@ -152,7 +156,8 @@ Base.:(==)(x::OnColRow, y::OnColRow) = MethodError(==, (x, y)) return @inbounds isequal(c11[r1], c21[r2]) && isequal(c12[r1], c22[r2]) end -@inline function Base.isequal(ocr1::OnColRow{<:NTuple{3,AbstractVector}}, ocr2::OnColRow{<:NTuple{3,AbstractVector}}) +@inline function Base.isequal(ocr1::OnColRow{<:NTuple{3,AbstractVector}}, + ocr2::OnColRow{<:NTuple{3,AbstractVector}}) r1 = ocr1.row c11, c12, c13 = ocr1.cols r2 = ocr2.row @@ -162,7 +167,8 @@ end isequal(c12[r1], c22[r2]) && isequal(c13[r1], c23[r2]) end -@inline function Base.isequal(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, ocr2::OnColRow{<:NTuple{N,AbstractVector}}) where {N} +@inline function Base.isequal(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, + ocr2::OnColRow{<:NTuple{N,AbstractVector}}) where {N} r1 = ocr1.row cols1 = ocr1.cols r2 = ocr2.row @@ -174,7 +180,8 @@ end return true end -@inline function Base.isless(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, ocr2::OnColRow{<:NTuple{2, AbstractVector}}) +@inline function Base.isless(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, + ocr2::OnColRow{<:NTuple{2, AbstractVector}}) r1 = ocr1.row c11, c12 = ocr1.cols r2 = ocr2.row @@ -190,7 +197,8 @@ end return isless(c12r, c22r) end -@inline function Base.isless(ocr1::OnColRow{<:NTuple{3,AbstractVector}}, ocr2::OnColRow{<:NTuple{3,AbstractVector}}) +@inline function Base.isless(ocr1::OnColRow{<:NTuple{3,AbstractVector}}, + ocr2::OnColRow{<:NTuple{3,AbstractVector}}) r1 = ocr1.row c11, c12, c13 = ocr1.cols r2 = ocr2.row @@ -210,7 +218,8 @@ end return isless(c13r, c23r) end -@inline function Base.isless(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, ocr2::OnColRow{<:NTuple{N,AbstractVector}}) where {T1, T2, N} +@inline function Base.isless(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, + ocr2::OnColRow{<:NTuple{N,AbstractVector}}) where {T1, T2, N} r1 = ocr1.row cols1 = ocr1.cols r2 = ocr2.row @@ -505,7 +514,7 @@ function _innerjoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}} minv, maxv = extrema_missing(right) val_range = big(maxv) - big(minv) - if (val_range > 128 && val_range ÷ 2 > length(right)) || + if val_range ÷ 2 > max(64, length(right)) || minv < typemin(Int) + 2 || maxv > typemax(Int) - 3 return _innerjoin_unsorted(left, right) end @@ -548,7 +557,8 @@ end # we fall back to general case if we have duplicates # normally it should happen fast as we reuse work already done -function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}, dict::Dict{T, Int}, idx_r_start::Int) where {T} +function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}, + dict::Dict{T, Int}, idx_r_start::Int) where {T} ngroups = idx_r_start - 1 right_len = length(right) groups = Vector{Int}(undef, right_len) @@ -575,7 +585,8 @@ end function _innerjoin_dup_int(left::AbstractVector{<:Union{Integer, Missing}}, right::AbstractVector{<:Union{Integer, Missing}}, - dict::Vector{Int}, idx_r_start::Int, offset::Int, minv::Int, maxv::Int) + dict::Vector{Int}, idx_r_start::Int, offset::Int, + minv::Int, maxv::Int) ngroups = idx_r_start - 1 right_len = length(right) groups = Vector{Int}(undef, right_len) From 6d46f1b434f3924de3511c80f3b29bf69595de7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 10:30:02 +0100 Subject: [PATCH 26/59] fix test coverage --- test/join.jl | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/test/join.jl b/test/join.jl index 9bae032835..eeedb92d85 100644 --- a/test/join.jl +++ b/test/join.jl @@ -956,14 +956,16 @@ end OnCol(c2, c3, c4, c5), OnCol(c3, c4, c5, c6), OnCol(c1, c2, c3, c4, c5), OnCol(c2, c3, c4, c5, c6), OnCol(c1, c2, c3, c4, c5, c6), OnCol(c4, c7), OnCol(c4, c5, c7), OnCol(c4, c5, c6, c7)] - tupcols = [tuple.(c1, c2), tuple.(c3, c4), tuple.(c5, c6), tuple.(c1, c2, c3), - tuple.(c2, c3, c4), tuple.(c4, c5, c6), tuple.(c1, c2, c3, c4), - tuple.(c2, c3, c4, c5), tuple.(c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5), - tuple.(c2, c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5, c6), - tuple.(c4, c7), tuple.(c4, c5, c7), tuple.(c4, c5, c6, c7)] + tupcols = [tuple.(c1, c2), tuple.(c3, c4), tuple.(c5, c6), tuple.(c1, c2, c3), + tuple.(c2, c3, c4), tuple.(c4, c5, c6), tuple.(c1, c2, c3, c4), + tuple.(c2, c3, c4, c5), tuple.(c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5), + tuple.(c2, c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5, c6), + tuple.(c4, c7), tuple.(c4, c5, c7), tuple.(c4, c5, c6, c7)] for (oncol, tupcol) in zip(oncols, tupcols) @test issorted(oncol) == issorted(tupcol) + @test IndexStyle(oncol) === IndexLinear() + @test_throws MethodError oncol[1] == oncol[2] end for i in eachindex(c1), j in eachindex(oncols, tupcols) @@ -980,10 +982,10 @@ end foreach(shuffle!, [c1, c2, c3, c4, c5, c6]) tupcols = [tuple.(c1, c2), tuple.(c3, c4), tuple.(c5, c6), tuple.(c1, c2, c3), - tuple.(c2, c3, c4), tuple.(c4, c5, c6), tuple.(c1, c2, c3, c4), - tuple.(c2, c3, c4, c5), tuple.(c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5), - tuple.(c2, c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5, c6), - tuple.(c4, c7), tuple.(c4, c5, c7), tuple.(c4, c5, c6, c7)] + tuple.(c2, c3, c4), tuple.(c4, c5, c6), tuple.(c1, c2, c3, c4), + tuple.(c2, c3, c4, c5), tuple.(c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5), + tuple.(c2, c3, c4, c5, c6), tuple.(c1, c2, c3, c4, c5, c6), + tuple.(c4, c7), tuple.(c4, c5, c7), tuple.(c4, c5, c6, c7)] for i in eachindex(c1), j in eachindex(oncols, tupcols) @test hash(oncols[j][i]) == hash(tupcols[j][i]) @@ -998,6 +1000,9 @@ end end @testset "innerjoin correctness tests" begin + + @test_throws ArgumentError DataFrames.prepare_on_col() + function test_innerjoin(df1, df2) @assert names(df1) == ["id", "x"] @assert names(df2) == ["id", "y"] @@ -1058,11 +1063,32 @@ end end end + # some special cases @test innerjoin(DataFrame(id=[]), DataFrame(id=[]), on=:id) == DataFrame(id=[]) @test innerjoin(DataFrame(id=[]), DataFrame(id=[1, 2, 3]), on=:id) == DataFrame(id=[]) @test innerjoin(DataFrame(id=[1, 2, 3]), DataFrame(id=[]), on=:id) == DataFrame(id=[]) @test innerjoin(DataFrame(id=[4, 5, 6]), DataFrame(id=[1, 2, 3]), on=:id) == DataFrame(id=[]) @test innerjoin(DataFrame(id=[1, 2, 3]), DataFrame(id=[4, 5, 6]), on=:id) == DataFrame(id=[]) + + @test innerjoin(DataFrame(id=[missing]), DataFrame(id=[1]), on=:id, matchmissing=:equal) == + DataFrame(id=[]) + @test innerjoin(DataFrame(id=Missing[]), DataFrame(id=[1]), on=:id, matchmissing=:equal) == + DataFrame(id=[]) + @test innerjoin(DataFrame(id=Union{Int, Missing}[]), DataFrame(id=[1]), on=:id, matchmissing=:equal) == + DataFrame(id=[]) + @test innerjoin(DataFrame(id=Union{Int, Missing}[missing]), DataFrame(id=[1]), + on=:id, matchmissing=:equal) == DataFrame(id=[]) + @test innerjoin(DataFrame(id=[missing]), DataFrame(id=[1, missing]), + on=:id, matchmissing=:equal) ≅ DataFrame(id=[missing]) + @test innerjoin(DataFrame(id=Union{Int, Missing}[missing]), DataFrame(id=[1, missing]), + on=:id, matchmissing=:equal) ≅ DataFrame(id=[missing]) + + @test innerjoin(DataFrame(id=[typemin(Int) + 1, typemin(Int)]), DataFrame(id=[typemin(Int)]), on=:id) == + DataFrame(id=[typemin(Int)]) + @test innerjoin(DataFrame(id=[typemax(Int), typemax(Int) - 1]), DataFrame(id=[typemax(Int)]), on=:id) == + DataFrame(id=[typemax(Int)]) + @test innerjoin(DataFrame(id=[2000, 2, 100]), DataFrame(id=[2000, 1, 100]), on=:id) == + DataFrame(id=[2000, 100]) end end # module From 1ef1362b67ef3f37d6ee23968ed4f55332f2c838 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 10:30:29 +0100 Subject: [PATCH 27/59] fix test coverage --- src/abstractdataframe/join.jl | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index c30f7a6da6..b2e8dd9b3f 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -342,15 +342,12 @@ function compose_inner_table(joiner::DataFrameJoiner, left_col = prepare_on_col(left_cols...) right_col = prepare_on_col(right_cols...) - local left_ixs - local right_ixs - local already_joined + left_ixs = Int[] + right_ixs = Int[] + already_joined = false - if isempty(left_col) || isempty(right_col) - # we treat this case separately so we know we have at least one element later - left_ixs, right_ixs = Int[], Int[] - else - already_joined = false + # here we know that we have at least one element later + if !(isempty(left_col) || isempty(right_col)) # if sorting is not disallowed try using a fast algorithm that works # on sorted columns; if it is not run or errors fall back to the unsorted case # the try-catch is used to handle the case when columns on which we join From eb507563e81fee811e0d914802ded7e91dd85d02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 10:31:57 +0100 Subject: [PATCH 28/59] revert change for better clarity --- src/abstractdataframe/join.jl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index b2e8dd9b3f..b10e4e4333 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -342,12 +342,14 @@ function compose_inner_table(joiner::DataFrameJoiner, left_col = prepare_on_col(left_cols...) right_col = prepare_on_col(right_cols...) - left_ixs = Int[] - right_ixs = Int[] + local left_ixs + local right_ixs already_joined = false - # here we know that we have at least one element later - if !(isempty(left_col) || isempty(right_col)) + if isempty(left_col) || isempty(right_col) + # we treat this case separately so we know we have at least one element later + left_ixs, right_ixs = Int[], Int[] + else # if sorting is not disallowed try using a fast algorithm that works # on sorted columns; if it is not run or errors fall back to the unsorted case # the try-catch is used to handle the case when columns on which we join From 7cfb5b47078a69589e5a69486ef84d32398e24eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 10:32:53 +0100 Subject: [PATCH 29/59] another small fix --- src/abstractdataframe/join.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index b10e4e4333..0cad8e1f8c 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -344,7 +344,6 @@ function compose_inner_table(joiner::DataFrameJoiner, local left_ixs local right_ixs - already_joined = false if isempty(left_col) || isempty(right_col) # we treat this case separately so we know we have at least one element later @@ -354,6 +353,7 @@ function compose_inner_table(joiner::DataFrameJoiner, # on sorted columns; if it is not run or errors fall back to the unsorted case # the try-catch is used to handle the case when columns on which we join # contain values that are not comparable + already_joined = false if !disallow_sorted try if issorted(left_col) && issorted(right_col) From d9dd15fc61cdd6ff8d043f77fc2bda179f35da01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 10:56:50 +0100 Subject: [PATCH 30/59] fix method definition --- src/abstractdataframe/join.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 0cad8e1f8c..67baf45f1e 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -144,7 +144,7 @@ end return return hv end -Base.:(==)(x::OnColRow, y::OnColRow) = MethodError(==, (x, y)) +Base.:(==)(x::OnColRow, y::OnColRow) = throw(MethodError(==, (x, y))) @inline function Base.isequal(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, ocr2::OnColRow{<:NTuple{2, AbstractVector}}) From fd035876b70b6fb4b17f83243cd03ecec974895d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 13:04:12 +0100 Subject: [PATCH 31/59] Apply suggestions from code review --- src/abstractdataframe/join.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 67baf45f1e..b4cab1db1a 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -513,7 +513,7 @@ function _innerjoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}} minv, maxv = extrema_missing(right) val_range = big(maxv) - big(minv) - if val_range ÷ 2 > max(64, length(right)) || + if val_range > typemax(Int) - 3 || val_range ÷ 2 > max(64, length(right)) || minv < typemin(Int) + 2 || maxv > typemax(Int) - 3 return _innerjoin_unsorted(left, right) end From e30f51abaf120b5f4156914a2b5ca03b4f5f0981 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 13:37:50 +0100 Subject: [PATCH 32/59] change hash implementation --- src/abstractdataframe/join.jl | 45 ++++++++++++++++------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index b4cab1db1a..b7f047605f 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -91,22 +91,24 @@ _rename_cols(old_names::AbstractVector{Symbol}, struct OnColRow{T} row::Int cols::T + h::Vector{UInt} OnColRow(row::Union{Signed,Unsigned}, - cols::NTuple{N, AbstractVector}) where {N} = - new{typeof(cols)}(Int(row), cols) + cols::NTuple{N, AbstractVector}, h::Vector{UInt}) where {N} = + new{typeof(cols)}(Int(row), cols, h) end struct OnCol{T,N} <: AbstractVector{OnColRow{T}} len::Int cols::T + h::Vector{UInt} function OnCol(cs::AbstractVector...) @assert length(cs) > 1 len = length(cs[1]) @assert all(x -> firstindex(x) == 1, cs) @assert all(x -> lastindex(x) == len, cs) - new{typeof(cs), length(cs)}(len, cs) + new{typeof(cs), length(cs)}(len, cs, UInt[]) end end @@ -116,34 +118,24 @@ Base.IndexStyle(::Type{<:OnCol}) = Base.IndexLinear() @inline function Base.getindex(oc::OnCol, i::Int) @boundscheck checkbounds(oc, i) - return OnColRow(i, oc.cols) + return OnColRow(i, oc.cols, oc.h) end -# TODO: rewrite hash, isequal and isless to use @generated -# or some other approach that would keep them efficient and avoid code duplication - -@inline function Base.hash(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, h::UInt) - r1 = ocr1.row - c11, c12 = ocr1.cols - return @inbounds hash(c11[r1], hash((c12[r1],), h)) -end - -@inline function Base.hash(ocr1::OnColRow{<:NTuple{3,AbstractVector}}, h::UInt) - r1 = ocr1.row - c11, c12, c13 = ocr1.cols - return @inbounds hash(c11[r1], hash(c12[r1], hash((c13[r1],), h))) -end +Base.hash(ocr1::OnColRow, h::UInt) = throw(MethodError(hash, (ocr1, h))) +@inline Base.hash(ocr1::OnColRow) = @inbounds ocr1.h[ocr1.row] -@inline function Base.hash(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, h::UInt) where {N} - r1 = ocr1.row - cols1 = ocr1.cols - @inbounds hv = hash((cols1[end][r1],), h) - for i in N-1:-1:1 - hv = @inbounds hash(cols1[i][r1], hv) +function _prehash(oc::OnCol) + h = oc.h + resize!(h, oc.len) + fill!(h, Base.tuplehash_seed) + for col in reverse(oc.cols) + h .= hash.(col, h) end - return return hv end +# TODO: rewrite isequal and isless to use @generated +# or some other approach that would keep them efficient and avoid code duplication + Base.:(==)(x::OnColRow, y::OnColRow) = throw(MethodError(==, (x, y))) @inline function Base.isequal(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, @@ -471,6 +463,9 @@ end function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where {T} dict = Dict{T, Int}() + right <: OnCol && _prehash(right) + left <: OnCol && _prehash(left) + for (idx_r, val_r) in enumerate(right) # we use dict_index to make sure the following two operations are fast: # - if index is found - fall back to algorithm allowing duplicates From 6aa95e3a9f2cc4b56372ef0a14777b980c89f178 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 13:45:51 +0100 Subject: [PATCH 33/59] fix typo --- src/abstractdataframe/join.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index b7f047605f..a877cc88eb 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -463,8 +463,8 @@ end function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where {T} dict = Dict{T, Int}() - right <: OnCol && _prehash(right) - left <: OnCol && _prehash(left) + right isa OnCol && _prehash(right) + left isa OnCol && _prehash(left) for (idx_r, val_r) in enumerate(right) # we use dict_index to make sure the following two operations are fast: From bdcaeef696cf1db08846053c4ef5d48cd8de2d30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 13:51:35 +0100 Subject: [PATCH 34/59] fix tests --- test/join.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/join.jl b/test/join.jl index eeedb92d85..643f0fc397 100644 --- a/test/join.jl +++ b/test/join.jl @@ -969,8 +969,8 @@ end end for i in eachindex(c1), j in eachindex(oncols, tupcols) + DataFrames._prehash(oncols[j]) @test hash(oncols[j][i]) == hash(tupcols[j][i]) - @test hash(oncols[j][i], UInt(10)) == hash(tupcols[j][i], UInt(10)) for k in eachindex(c1) @test isequal(oncols[j][i], oncols[j][k]) == isequal(tupcols[j][i], tupcols[j][k]) @test isequal(oncols[j][k], oncols[j][i]) == isequal(tupcols[j][k], tupcols[j][i]) @@ -988,8 +988,8 @@ end tuple.(c4, c7), tuple.(c4, c5, c7), tuple.(c4, c5, c6, c7)] for i in eachindex(c1), j in eachindex(oncols, tupcols) + DataFrames._prehash(oncols[j]) @test hash(oncols[j][i]) == hash(tupcols[j][i]) - @test hash(oncols[j][i], UInt(10)) == hash(tupcols[j][i], UInt(10)) for k in eachindex(c1) @test isequal(oncols[j][i], oncols[j][k]) == isequal(tupcols[j][i], tupcols[j][k]) @test isequal(oncols[j][k], oncols[j][i]) == isequal(tupcols[j][k], tupcols[j][i]) From 1150126098d685621c2f8ba31d64065ea9c96b09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 18:53:03 +0100 Subject: [PATCH 35/59] consistent detection of CategoricalArrays.jl types --- src/abstractdataframe/join.jl | 4 ++-- src/abstractdataframe/show.jl | 4 +--- src/groupeddataframe/fastaggregates.jl | 10 ++++------ 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index a877cc88eb..457d168071 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -311,9 +311,9 @@ function compose_inner_table(joiner::DataFrameJoiner, @assert length(lc) == left_len @assert length(rc) == right_len lct = typeof(lc) + lcat = nameof(lct) === :CategoricalArray && nameof(parentmodule(lct)) === :CategoricalArrays rct = typeof(rc) - rcat = rct.name === :CategoricalArray && nameof(rct.module) === :CategoricalArrays - lcat = lct.name === :CategoricalArray && nameof(lct.module) === :CategoricalArrays + rcat = nameof(rct) === :CategoricalArray && nameof(parentmodule(rct)) === :CategoricalArrays disallow_sorted |= rcat ⊻ lcat end diff --git a/src/abstractdataframe/show.jl b/src/abstractdataframe/show.jl index febb4853b0..536b47a664 100644 --- a/src/abstractdataframe/show.jl +++ b/src/abstractdataframe/show.jl @@ -91,9 +91,7 @@ function compacttype(T::Type, maxwidth::Int=8, initial::Bool=true) maxwidth -= 1 # we will add "…" at the end - # This is only type display shortening so we - # are OK with any T whose name starts with CategoricalValue here - if startswith(sT, "CategoricalValue") || startswith(sT, "CategoricalArrays.CategoricalValue") + if nameof(T) === :CategoricalValue && nameof(parentmodule(T)) === :CategoricalArrays sT = string(nameof(T)) if textwidth(sT) ≤ maxwidth return sT * "…" * suffix diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl index 039b8e9f49..94077bef57 100644 --- a/src/groupeddataframe/fastaggregates.jl +++ b/src/groupeddataframe/fastaggregates.jl @@ -122,9 +122,8 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax)) # !ismissing check is purely an optimization to avoid a copy later outcol = similar(incol, condf === !ismissing ? S : T, length(gd)) # Comparison is possible only between CatValues from the same pool - outcolT = typeof(outcol).name - if outcolT.name === :CategoricalArray && - nameof(outcolT.module) === :CategoricalArrays + resT = typeof(outcol).name + if nameof(resT) === :CategoricalArray && nameof(parentmodule(resT)) === :CategoricalArrays # we know that CategoricalArray has `pool` field outcol.pool = incol.pool end @@ -214,9 +213,8 @@ function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Boo end # Reallocate Vector created in groupreduce_init with min or max # for CategoricalVector - resT = typeof(res).name - if resT.name === :CategoricalArray && - nameof(resT.module) === :CategoricalArrays + resT = typeof(res) + if nameof(resT) === :CategoricalArray && nameof(parentmodule(resT)) === :CategoricalArrays @assert op === min || op === max # we know that CategoricalArray has `pool` field @assert res.pool === incol.pool From 0c1e8b6359562fef8dac97aaa81f25b1871c9303 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 18:59:19 +0100 Subject: [PATCH 36/59] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/join.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index a877cc88eb..9f2c3110bd 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -94,7 +94,7 @@ struct OnColRow{T} h::Vector{UInt} OnColRow(row::Union{Signed,Unsigned}, - cols::NTuple{N, AbstractVector}, h::Vector{UInt}) where {N} = + cols::NTuple{<:Any, AbstractVector}, h::Vector{UInt}) = new{typeof(cols)}(Int(row), cols, h) end @@ -124,6 +124,7 @@ end Base.hash(ocr1::OnColRow, h::UInt) = throw(MethodError(hash, (ocr1, h))) @inline Base.hash(ocr1::OnColRow) = @inbounds ocr1.h[ocr1.row] +# Hashing one column at a time is faster since it can use SIMD function _prehash(oc::OnCol) h = oc.h resize!(h, oc.len) @@ -492,7 +493,7 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where return left_ixs, right_ixs end -extrema_missing(x::AbstractVector{Missing}) = 1, 0 +extrema_missing(x::AbstractVector{Missing}) = (1, 0) function extrema_missing(x::AbstractVector{T}) where {T<:Union{Integer, Missing}} try @@ -538,7 +539,7 @@ function _innerjoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}} push!(left_ixs, idx_l) push!(right_ixs, idx_r) end - elseif (minv <= val_l <= maxv) + elseif minv <= val_l <= maxv idx_r = dict[Int(val_l) + offset] if idx_r > 0 push!(left_ixs, idx_l) @@ -678,7 +679,7 @@ function _innerjoin_postprocess_int(left::AbstractVector{<:Union{Integer, Missin @inbounds for (idx_l, val_l) in enumerate(left) if ismissing(val_l) group_id = dict[end] - elseif (minv <= val_l <= maxv) + elseif minv <= val_l <= maxv group_id = dict[Int(val_l) + offset] else group_id = 0 From 558129de36b7f3a0e34d708d3d0d9c72b564d2e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 20:21:21 +0100 Subject: [PATCH 37/59] add hash test --- test/join.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/join.jl b/test/join.jl index 643f0fc397..266d5bf2b7 100644 --- a/test/join.jl +++ b/test/join.jl @@ -969,6 +969,7 @@ end end for i in eachindex(c1), j in eachindex(oncols, tupcols) + @test_throws MethodError hash(oncols[j][1], zero(UInt)) DataFrames._prehash(oncols[j]) @test hash(oncols[j][i]) == hash(tupcols[j][i]) for k in eachindex(c1) From cbe214e11e3fd170b1e99cfe41db51714d0bad77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 20:58:41 +0100 Subject: [PATCH 38/59] in printing we might have union --- src/abstractdataframe/show.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/abstractdataframe/show.jl b/src/abstractdataframe/show.jl index 536b47a664..febb4853b0 100644 --- a/src/abstractdataframe/show.jl +++ b/src/abstractdataframe/show.jl @@ -91,7 +91,9 @@ function compacttype(T::Type, maxwidth::Int=8, initial::Bool=true) maxwidth -= 1 # we will add "…" at the end - if nameof(T) === :CategoricalValue && nameof(parentmodule(T)) === :CategoricalArrays + # This is only type display shortening so we + # are OK with any T whose name starts with CategoricalValue here + if startswith(sT, "CategoricalValue") || startswith(sT, "CategoricalArrays.CategoricalValue") sT = string(nameof(T)) if textwidth(sT) ≤ maxwidth return sT * "…" * suffix From e01c1fed86b790e35326da9b86ff5143d82880a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 6 Feb 2021 20:59:29 +0100 Subject: [PATCH 39/59] fix typo --- src/groupeddataframe/fastaggregates.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl index 94077bef57..6e432d2ad9 100644 --- a/src/groupeddataframe/fastaggregates.jl +++ b/src/groupeddataframe/fastaggregates.jl @@ -122,7 +122,7 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax)) # !ismissing check is purely an optimization to avoid a copy later outcol = similar(incol, condf === !ismissing ? S : T, length(gd)) # Comparison is possible only between CatValues from the same pool - resT = typeof(outcol).name + resT = typeof(outcol) if nameof(resT) === :CategoricalArray && nameof(parentmodule(resT)) === :CategoricalArrays # we know that CategoricalArray has `pool` field outcol.pool = incol.pool From f50b9a127abc4d228b4593cc4a7ead6a17684ee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 7 Feb 2021 00:03:44 +0100 Subject: [PATCH 40/59] simplify isless and isequal for OnCol --- src/abstractdataframe/join.jl | 74 +++++------------------------------ 1 file changed, 9 insertions(+), 65 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index b2e59b1da8..111589ce17 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -149,29 +149,10 @@ Base.:(==)(x::OnColRow, y::OnColRow) = throw(MethodError(==, (x, y))) return @inbounds isequal(c11[r1], c21[r2]) && isequal(c12[r1], c22[r2]) end -@inline function Base.isequal(ocr1::OnColRow{<:NTuple{3,AbstractVector}}, - ocr2::OnColRow{<:NTuple{3,AbstractVector}}) - r1 = ocr1.row - c11, c12, c13 = ocr1.cols - r2 = ocr2.row - c21, c22, c23 = ocr2.cols - - return @inbounds isequal(c11[r1], c21[r2]) && - isequal(c12[r1], c22[r2]) && isequal(c13[r1], c23[r2]) -end - -@inline function Base.isequal(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, - ocr2::OnColRow{<:NTuple{N,AbstractVector}}) where {N} - r1 = ocr1.row - cols1 = ocr1.cols - r2 = ocr2.row - cols2 = ocr2.cols - - @inbounds for i in 1:N - isequal(cols1[i][r1], cols2[i][r2]) || return false - end - return true -end +Base.isequal(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, + ocr2::OnColRow{<:NTuple{N,AbstractVector}}) where {N} = + isequal(ntuple(i -> @inbounds(ocr1.cols[i][ocr1.row]), N), + ntuple(i -> @inbounds(ocr2.cols[i][ocr2.row]), N)) @inline function Base.isless(ocr1::OnColRow{<:NTuple{2, AbstractVector}}, ocr2::OnColRow{<:NTuple{2, AbstractVector}}) @@ -185,50 +166,13 @@ end c21r = @inbounds c21[r2] c22r = @inbounds c22[r2] - isless(c11r, c21r) && return true - isequal(c11r, c21r) || return false - return isless(c12r, c22r) -end - -@inline function Base.isless(ocr1::OnColRow{<:NTuple{3,AbstractVector}}, - ocr2::OnColRow{<:NTuple{3,AbstractVector}}) - r1 = ocr1.row - c11, c12, c13 = ocr1.cols - r2 = ocr2.row - c21, c22, c23 = ocr2.cols - - c11r = @inbounds c11[r1] - c12r = @inbounds c12[r1] - c13r = @inbounds c13[r1] - c21r = @inbounds c21[r2] - c22r = @inbounds c22[r2] - c23r = @inbounds c23[r2] - - isless(c11r, c21r) && return true - isequal(c11r, c21r) || return false - isless(c12r, c22r) && return true - isequal(c12r, c22r) || return false - return isless(c13r, c23r) + isless(c11r, c21r) || (isequal(c11r, c21r) && isless(c12r, c22r)) end -@inline function Base.isless(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, - ocr2::OnColRow{<:NTuple{N,AbstractVector}}) where {T1, T2, N} - r1 = ocr1.row - cols1 = ocr1.cols - r2 = ocr2.row - cols2 = ocr2.cols - - lastcols1 = @inbounds cols1[1][r1] - lastcols2 = @inbounds cols2[1][r2] - isless(lastcols1, lastcols2) && return true - @inbounds for i in 2:N - isequal(lastcols1, lastcols2) || return false - lastcols1 = cols1[i][r1] - lastcols2 = cols2[i][r2] - isless(lastcols1, lastcols2) && return true - end - return false -end +@inline Base.isless(ocr1::OnColRow{<:NTuple{N,AbstractVector}}, + ocr2::OnColRow{<:NTuple{N,AbstractVector}}) where {N} = + isless(ntuple(i -> @inbounds(ocr1.cols[i][ocr1.row]), N), + ntuple(i -> @inbounds(ocr2.cols[i][ocr2.row]), N)) prepare_on_col() = throw(ArgumentError("at least one on column required when joining")) prepare_on_col(c::AbstractVector) = c From a592b097b714eb9897e857d0c385303a4ee81f22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 7 Feb 2021 20:16:20 +0100 Subject: [PATCH 41/59] Update test/join.jl Co-authored-by: Milan Bouchet-Valat --- test/join.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/join.jl b/test/join.jl index 266d5bf2b7..0eece43784 100644 --- a/test/join.jl +++ b/test/join.jl @@ -936,7 +936,7 @@ end innerjoin(df1_view2, df2, on=:a) end -@testset "OnCol correcntess tests" begin +@testset "OnCol correctness tests" begin Random.seed!(1234) c1 = collect(1:10^2) c2 = collect(Float64, 1:10^2) From 1515c07072f8a0bde2d1ab3f258dfb9f4131b7ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 7 Feb 2021 20:16:57 +0100 Subject: [PATCH 42/59] add more tests --- test/join.jl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/test/join.jl b/test/join.jl index 266d5bf2b7..44ce79017c 100644 --- a/test/join.jl +++ b/test/join.jl @@ -1020,12 +1020,20 @@ end df2x = copy(df2) df2x.id2 = copy(df2x.id) + df1x2 = copy(df1x) + df1x2.id3 = copy(df1x2.id) + df2x2 = copy(df2x) + df2x2.id3 = copy(df2x2.id) + sort!(dfres) dfres2 = copy(dfres) insertcols!(dfres2, 3, :id2 => dfres2.id) + dfres3 = copy(dfres2) + insertcols!(dfres3, 4, :id3 => dfres3.id) return dfres ≅ sort(innerjoin(df1, df2, on=:id, matchmissing=:equal)) && - dfres2 ≅ sort(innerjoin(df1x, df2x, on=[:id, :id2], matchmissing=:equal)) + dfres2 ≅ sort(innerjoin(df1x, df2x, on=[:id, :id2], matchmissing=:equal)) && + dfres3 ≅ sort(innerjoin(df1x2, df2x2, on=[:id, :id2, :id3], matchmissing=:equal)) end Random.seed!(1234) From d8f1fe4022c8564375f23b3dd828b29a2a44a001 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 7 Feb 2021 20:21:36 +0100 Subject: [PATCH 43/59] additional tests --- test/join.jl | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/join.jl b/test/join.jl index a306a92cfe..f195eb7419 100644 --- a/test/join.jl +++ b/test/join.jl @@ -1058,6 +1058,15 @@ end @test test_innerjoin(df1p, df2p) @test test_innerjoin(df1p, rename(df1p, :x => :y)) + df1p[1, 1] = 0 + df2p[1, 1] = 0 + df1p[1, 1] = 1 + df2p[1, 1] = 1 + @test test_innerjoin(df1, df2p) + @test test_innerjoin(df1p, df2) + @test test_innerjoin(df1p, df2p) + @test test_innerjoin(df1p, rename(df1p, :x => :y)) + df1c = copy(opleft(df1)) df1c[!, 1] = categorical(df1c[!, 1]) df2c = copy(opleft(df2)) @@ -1068,6 +1077,17 @@ end @test test_innerjoin(df1c, rename(df1c, :x => :y)) @test test_innerjoin(df1p, df2c) @test test_innerjoin(df1c, df2p) + + df1c[1, 1] = 0 + df2c[1, 1] = 0 + df1c[1, 1] = 1 + df2c[1, 1] = 1 + @test test_innerjoin(df1, df2c) + @test test_innerjoin(df1c, df2c) + @test test_innerjoin(df1c, df2) + @test test_innerjoin(df1c, rename(df1c, :x => :y)) + @test test_innerjoin(df1p, df2c) + @test test_innerjoin(df1c, df2p) end end end From bb8352791b82327b8cc044a3f2804b22d6a4d7cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 7 Feb 2021 20:23:56 +0100 Subject: [PATCH 44/59] add innerjoin benchmark --- benchmarks/innerjoin_performance.jl | 233 ++++++++++++++++++++++++++++ 1 file changed, 233 insertions(+) create mode 100644 benchmarks/innerjoin_performance.jl diff --git a/benchmarks/innerjoin_performance.jl b/benchmarks/innerjoin_performance.jl new file mode 100644 index 0000000000..780bd795c3 --- /dev/null +++ b/benchmarks/innerjoin_performance.jl @@ -0,0 +1,233 @@ +using CategoricalArrays +using CSV +using DataFrames +using Dates +using PooledArrays +using Random + +function run_innerjoin_tests(warmup::Bool = false) + warmup || run_innerjoin_tests(true) + Random.seed!(1234); + @info warmup ? "warmup" : "testing performance" + df = DataFrame(llen=[], rlen=[], type=[], time=[], alloc=[], gc=[]) + # change line below to match + # your preferred range of values tested + # and memory availability in your system + for llen in [10^3, 10^6], rlen in [2*10^7] + if warmup + llen, rlen = 1000, 1000 + else + println("\nSize:") + @show llen, rlen + end + println() + warmup || @info "sorted string unique" + df1, df2 = nothing, nothing + df1 = DataFrame(id = sort!(string.(1:llen)), copycols=false) + df2 = DataFrame(id = sort!(string.(1:rlen)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "sorted string unique", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "sorted string unique", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled string unique" + df1, df2 = nothing, nothing + df1 = DataFrame(id = shuffle!(string.(1:llen)), copycols=false) + df2 = DataFrame(id = shuffle!(string.(1:rlen)), copycols=false) + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "shuffled string unique", x.time, x.bytes, x.gctime]) + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "shuffled string unique", x.time, x.bytes, x.gctime]) + warmup || @info "sorted string duplicates" + df1, df2 = nothing, nothing + df1 = DataFrame(id = sort!(rand(string.(1:llen), llen)), copycols=false) + df2 = DataFrame(id = sort!(rand(string.(1:rlen), rlen)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "sorted string duplicates", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "sorted string duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "sorted string duplicates many" + df1, df2 = nothing, nothing + df1 = DataFrame(id = sort!(rand(string.(1:llen ÷ 100), llen)), copycols=false) + df2 = DataFrame(id = sort!(rand(string.(1:rlen ÷ 100), rlen)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "sorted string duplicates many", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "sorted string duplicates many", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled string duplicates" + df1, df2 = nothing, nothing + df1 = DataFrame(id = rand(string.(1:llen), llen), copycols=false) + df2 = DataFrame(id = rand(string.(1:rlen), rlen), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "shuffled string duplicates", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "shuffled string duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled string duplicates many" + df1, df2 = nothing, nothing + df1 = DataFrame(id = rand(string.(1:llen ÷ 100), llen), copycols=false) + df2 = DataFrame(id = rand(string.(1:rlen ÷ 100), rlen), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "shuffled string duplicates many", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "shuffled string duplicates many", x.time, x.bytes, x.gctime]) + + warmup || @info "sorted int unique" + df1, df2 = nothing, nothing + df1 = DataFrame(id = sort!(1:llen), copycols=false) + df2 = DataFrame(id = sort!(1:rlen), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "sorted int unique", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "sorted int unique", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled int unique" + df1, df2 = nothing, nothing + df1 = DataFrame(id = shuffle(1:llen), copycols=false) + df2 = DataFrame(id = shuffle(1:rlen), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "shuffled int unique", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "shuffled int unique", x.time, x.bytes, x.gctime]) + warmup || @info "sorted int duplicates" + df1, df2 = nothing, nothing + df1 = DataFrame(id = sort!(rand(1:llen, llen)), copycols=false) + df2 = DataFrame(id = sort!(rand(1:rlen, rlen)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "sorted int duplicates", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "sorted int duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "sorted int duplicates many" + df1, df2 = nothing, nothing + df1 = DataFrame(id = sort!(rand(1:llen ÷ 100, llen)), copycols=false) + df2 = DataFrame(id = sort!(rand(1:rlen ÷ 100, rlen)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "sorted int duplicates many", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "sorted int duplicates many", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled int duplicates" + df1, df2 = nothing, nothing + df1 = DataFrame(id = rand(1:llen, llen), copycols=false) + df2 = DataFrame(id = rand(1:rlen, rlen), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "shuffled int duplicates", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "shuffled int duplicates", x.time, x.bytes, x.gctime]) + + warmup || @info "shuffled int duplicates many" + df1, df2 = nothing, nothing + df1 = DataFrame(id = rand(1:llen ÷ 100, llen), copycols=false) + df2 = DataFrame(id = rand(1:rlen ÷ 100, rlen), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "shuffled int duplicates many", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "shuffled int duplicates many", x.time, x.bytes, x.gctime]) + + warmup || @info "sorted PooledArray duplicates" + df1, df2 = nothing, nothing + df1 = DataFrame(id = PooledArray(rand(string.(1:llen), llen)), copycols=false) + df2 = DataFrame(id = PooledArray(repeat(string.(1:rlen ÷ 10), inner=10)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "sorted PooledArray duplicates", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "sorted PooledArray duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "sorted PooledArray duplicates many" + df1, df2 = nothing, nothing + df1 = DataFrame(id = PooledArray(rand(string.(1:llen ÷ 100), llen)), copycols=false) + df2 = DataFrame(id = PooledArray(repeat(string.(1:rlen ÷ 100), inner=10)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "sorted PooledArray duplicates many", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "sorted PooledArray duplicates many", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled PooledArray duplicates" + df1, df2 = nothing, nothing + df1 = DataFrame(id = PooledArray(rand(string.(1:llen), llen)), copycols=false) + df2 = DataFrame(id = PooledArray(rand(string.(1:rlen ÷ 10), rlen)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "shuffled PooledArray duplicates", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "shuffled PooledArray duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled PooledArray duplicates many" + df1, df2 = nothing, nothing + df1 = DataFrame(id = PooledArray(rand(string.(1:llen ÷ 100), llen)), copycols=false) + df2 = DataFrame(id = PooledArray(rand(string.(1:rlen ÷ 100), rlen)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "shuffled PooledArray duplicates many", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "shuffled PooledArray duplicates many", x.time, x.bytes, x.gctime]) + + warmup || @info "sorted CategoricalArray duplicates" + df1, df2 = nothing, nothing + df1 = DataFrame(id = categorical(rand(string.(1:llen), llen)), copycols=false) + df2 = DataFrame(id = categorical(repeat(string.(1:rlen ÷ 10), inner=10)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "sorted CategoricalArray duplicates", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "sorted CategoricalArray duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "sorted CategoricalArray duplicates many" + df1, df2 = nothing, nothing + df1 = DataFrame(id = categorical(rand(string.(1:llen ÷ 100), llen)), copycols=false) + df2 = DataFrame(id = categorical(repeat(string.(1:rlen ÷ 100), inner=10)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "sorted CategoricalArray duplicates many", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "sorted CategoricalArray duplicates many", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled CategoricalArray duplicates" + df1, df2 = nothing, nothing + df1 = DataFrame(id = categorical(rand(string.(1:llen), llen)), copycols=false) + df2 = DataFrame(id = categorical(rand(string.(1:rlen ÷ 10), rlen)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "shuffled CategoricalArray duplicates", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "shuffled CategoricalArray duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled CategoricalArray duplicates many" + df1, df2 = nothing, nothing + df1 = DataFrame(id = categorical(rand(string.(1:llen ÷ 100), llen)), copycols=false) + df2 = DataFrame(id = categorical(rand(string.(1:rlen ÷ 100), rlen)), copycols=false) + GC.gc() + x = @timed innerjoin(df1, df2, on=:id) + push!(df, [llen, rlen, "shuffled CategoricalArray duplicates many", x.time, x.bytes, x.gctime]) + GC.gc() + x = @timed innerjoin(df2, df1, on=:id) + push!(df, [llen, rlen, "shuffled CategoricalArray duplicates many", x.time, x.bytes, x.gctime]) + + warmup && break + end + return df +end + +res = run_innerjoin_tests() +CSV.write("results_$(now()).csv", res) From 56c4c5e397bbe476df38d6031485815ded6f1e95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 7 Feb 2021 21:44:03 +0100 Subject: [PATCH 45/59] more tests to ensure full coverage --- test/join.jl | 64 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/test/join.jl b/test/join.jl index f195eb7419..d160f8e5e4 100644 --- a/test/join.jl +++ b/test/join.jl @@ -1046,48 +1046,62 @@ end DataFrame(id=[rand(1:i, i); missing], y=1:i+1)] for opleft = [identity, sort, x -> unique(x, :id), x -> sort(unique(x, :id))], opright = [identity, sort, x -> unique(x, :id), x -> sort(unique(x, :id))] + + # integers @test test_innerjoin(opleft(df1), opright(df2)) @test test_innerjoin(opleft(df1), opright(rename(df1, :x => :y))) - df1p = copy(opleft(df1)) + # strings + df1s = copy(df1) + df1s[!, 1] = passmissing(string).(df1s[!, 1]) + df2s = copy(df2) + df2s[!, 1] = passmissing(string).(df2s[!, 1]) + @test test_innerjoin(opleft(df1s), opright(df2s)) + @test test_innerjoin(opleft(df1s), opright(rename(df1s, :x => :y))) + + # PooledArrays + df1p = copy(df1) df1p[!, 1] = PooledArray(df1p[!, 1]) - df2p = copy(opleft(df2)) + df2p = copy(df2) df2p[!, 1] = PooledArray(df2p[!, 1]) - @test test_innerjoin(df1, df2p) - @test test_innerjoin(df1p, df2) - @test test_innerjoin(df1p, df2p) - @test test_innerjoin(df1p, rename(df1p, :x => :y)) + @test test_innerjoin(opleft(df1), opright(df2p)) + @test test_innerjoin(opleft(df1p), opright(df2)) + @test test_innerjoin(opleft(df1p), opright(df2p)) + @test test_innerjoin(opleft(df1p), opright(rename(df1p, :x => :y))) + # add unused level df1p[1, 1] = 0 df2p[1, 1] = 0 df1p[1, 1] = 1 df2p[1, 1] = 1 - @test test_innerjoin(df1, df2p) - @test test_innerjoin(df1p, df2) - @test test_innerjoin(df1p, df2p) - @test test_innerjoin(df1p, rename(df1p, :x => :y)) + @test test_innerjoin(opleft(df1), opright(df2p)) + @test test_innerjoin(opleft(df1p), opright(df2)) + @test test_innerjoin(opleft(df1p), opright(df2p)) + @test test_innerjoin(opleft(df1p), opright(rename(df1p, :x => :y))) - df1c = copy(opleft(df1)) + # CategoricalArrays + df1c = copy(df1) df1c[!, 1] = categorical(df1c[!, 1]) - df2c = copy(opleft(df2)) + df2c = copy(df2) df2c[!, 1] = categorical(df2c[!, 1]) - @test test_innerjoin(df1, df2c) - @test test_innerjoin(df1c, df2c) - @test test_innerjoin(df1c, df2) - @test test_innerjoin(df1c, rename(df1c, :x => :y)) - @test test_innerjoin(df1p, df2c) - @test test_innerjoin(df1c, df2p) - + @test test_innerjoin(opleft(df1), opright(df2c)) + @test test_innerjoin(opleft(df1c), opright(df2c)) + @test test_innerjoin(opleft(df1c), opright(df2)) + @test test_innerjoin(opleft(df1c), opright(rename(df1c, :x => :y))) + @test test_innerjoin(opleft(df1p), opright(df2c)) + @test test_innerjoin(opleft(df1c), opright(df2p)) + + # add unused level df1c[1, 1] = 0 df2c[1, 1] = 0 df1c[1, 1] = 1 df2c[1, 1] = 1 - @test test_innerjoin(df1, df2c) - @test test_innerjoin(df1c, df2c) - @test test_innerjoin(df1c, df2) - @test test_innerjoin(df1c, rename(df1c, :x => :y)) - @test test_innerjoin(df1p, df2c) - @test test_innerjoin(df1c, df2p) + @test test_innerjoin(opleft(df1), opright(df2c)) + @test test_innerjoin(opleft(df1c), opright(df2c)) + @test test_innerjoin(opleft(df1c), opright(df2)) + @test test_innerjoin(opleft(df1c), opright(rename(df1c, :x => :y))) + @test test_innerjoin(opleft(df1p), opright(df2c)) + @test test_innerjoin(opleft(df1c), opright(df2p)) end end end From f6971773b8c65f337945a1f4e3f72bb30df5800b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 7 Feb 2021 22:50:15 +0100 Subject: [PATCH 46/59] add linebreaks at @info --- benchmarks/innerjoin_performance.jl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/benchmarks/innerjoin_performance.jl b/benchmarks/innerjoin_performance.jl index 780bd795c3..4abfd0f432 100644 --- a/benchmarks/innerjoin_performance.jl +++ b/benchmarks/innerjoin_performance.jl @@ -8,7 +8,9 @@ using Random function run_innerjoin_tests(warmup::Bool = false) warmup || run_innerjoin_tests(true) Random.seed!(1234); + @info warmup ? "warmup" : "testing performance" + df = DataFrame(llen=[], rlen=[], type=[], time=[], alloc=[], gc=[]) # change line below to match # your preferred range of values tested @@ -21,6 +23,7 @@ function run_innerjoin_tests(warmup::Bool = false) @show llen, rlen end println() + warmup || @info "sorted string unique" df1, df2 = nothing, nothing df1 = DataFrame(id = sort!(string.(1:llen)), copycols=false) @@ -31,6 +34,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "sorted string unique", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled string unique" df1, df2 = nothing, nothing df1 = DataFrame(id = shuffle!(string.(1:llen)), copycols=false) @@ -39,6 +43,7 @@ function run_innerjoin_tests(warmup::Bool = false) push!(df, [llen, rlen, "shuffled string unique", x.time, x.bytes, x.gctime]) x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "shuffled string unique", x.time, x.bytes, x.gctime]) + warmup || @info "sorted string duplicates" df1, df2 = nothing, nothing df1 = DataFrame(id = sort!(rand(string.(1:llen), llen)), copycols=false) @@ -49,6 +54,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "sorted string duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "sorted string duplicates many" df1, df2 = nothing, nothing df1 = DataFrame(id = sort!(rand(string.(1:llen ÷ 100), llen)), copycols=false) @@ -59,6 +65,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "sorted string duplicates many", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled string duplicates" df1, df2 = nothing, nothing df1 = DataFrame(id = rand(string.(1:llen), llen), copycols=false) @@ -69,6 +76,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "shuffled string duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled string duplicates many" df1, df2 = nothing, nothing df1 = DataFrame(id = rand(string.(1:llen ÷ 100), llen), copycols=false) @@ -80,6 +88,7 @@ function run_innerjoin_tests(warmup::Bool = false) x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "shuffled string duplicates many", x.time, x.bytes, x.gctime]) + warmup || @info "sorted int unique" df1, df2 = nothing, nothing df1 = DataFrame(id = sort!(1:llen), copycols=false) @@ -90,6 +99,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "sorted int unique", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled int unique" df1, df2 = nothing, nothing df1 = DataFrame(id = shuffle(1:llen), copycols=false) @@ -100,6 +110,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "shuffled int unique", x.time, x.bytes, x.gctime]) + warmup || @info "sorted int duplicates" df1, df2 = nothing, nothing df1 = DataFrame(id = sort!(rand(1:llen, llen)), copycols=false) @@ -110,6 +121,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "sorted int duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "sorted int duplicates many" df1, df2 = nothing, nothing df1 = DataFrame(id = sort!(rand(1:llen ÷ 100, llen)), copycols=false) @@ -120,6 +132,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "sorted int duplicates many", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled int duplicates" df1, df2 = nothing, nothing df1 = DataFrame(id = rand(1:llen, llen), copycols=false) @@ -142,6 +155,7 @@ function run_innerjoin_tests(warmup::Bool = false) x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "shuffled int duplicates many", x.time, x.bytes, x.gctime]) + warmup || @info "sorted PooledArray duplicates" df1, df2 = nothing, nothing df1 = DataFrame(id = PooledArray(rand(string.(1:llen), llen)), copycols=false) @@ -152,6 +166,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "sorted PooledArray duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "sorted PooledArray duplicates many" df1, df2 = nothing, nothing df1 = DataFrame(id = PooledArray(rand(string.(1:llen ÷ 100), llen)), copycols=false) @@ -162,6 +177,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "sorted PooledArray duplicates many", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled PooledArray duplicates" df1, df2 = nothing, nothing df1 = DataFrame(id = PooledArray(rand(string.(1:llen), llen)), copycols=false) @@ -172,6 +188,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "shuffled PooledArray duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled PooledArray duplicates many" df1, df2 = nothing, nothing df1 = DataFrame(id = PooledArray(rand(string.(1:llen ÷ 100), llen)), copycols=false) @@ -183,6 +200,7 @@ function run_innerjoin_tests(warmup::Bool = false) x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "shuffled PooledArray duplicates many", x.time, x.bytes, x.gctime]) + warmup || @info "sorted CategoricalArray duplicates" df1, df2 = nothing, nothing df1 = DataFrame(id = categorical(rand(string.(1:llen), llen)), copycols=false) @@ -193,6 +211,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "sorted CategoricalArray duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "sorted CategoricalArray duplicates many" df1, df2 = nothing, nothing df1 = DataFrame(id = categorical(rand(string.(1:llen ÷ 100), llen)), copycols=false) @@ -203,6 +222,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "sorted CategoricalArray duplicates many", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled CategoricalArray duplicates" df1, df2 = nothing, nothing df1 = DataFrame(id = categorical(rand(string.(1:llen), llen)), copycols=false) @@ -213,6 +233,7 @@ function run_innerjoin_tests(warmup::Bool = false) GC.gc() x = @timed innerjoin(df2, df1, on=:id) push!(df, [llen, rlen, "shuffled CategoricalArray duplicates", x.time, x.bytes, x.gctime]) + warmup || @info "shuffled CategoricalArray duplicates many" df1, df2 = nothing, nothing df1 = DataFrame(id = categorical(rand(string.(1:llen ÷ 100), llen)), copycols=false) From fed4570b69adff07fc1a019a80ca888f9cdde4a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 9 Feb 2021 15:27:00 +0100 Subject: [PATCH 47/59] simplify loop in sorted case --- src/abstractdataframe/join.jl | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 111589ce17..352e30e190 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -379,10 +379,16 @@ function _innerjoin_sorted(left::AbstractArray, right::AbstractArray) push!(left_ixs, left_cur) push!(right_ixs, right_cur) else - for (left_i, right_i) in Iterators.product(left_cur:left_new - 1, - right_cur:right_new - 1) - push!(left_ixs, left_i) - push!(right_ixs, right_i) + idx = length(left_ixs) + left_range = left_cur:left_new - 1 + right_range = right_cur:right_new - 1 + to_grow = Base.checked_mul(length(left_range), length(right_range)) + Base._growend!(left_ixs, to_grow) + Base._growend!(right_ixs, to_grow) + @inbounds for right_i in right_range, left_i in left_range + idx += 1 + left_ixs[idx] = left_i + right_ixs[idx] = right_i end end left_cur, left_val = left_new, left_tmp From d0fb0b94316279ea82333236afd683352a558769 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 9 Feb 2021 17:03:20 +0100 Subject: [PATCH 48/59] use resize! --- src/abstractdataframe/join.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 352e30e190..786a2d9ca0 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -382,9 +382,9 @@ function _innerjoin_sorted(left::AbstractArray, right::AbstractArray) idx = length(left_ixs) left_range = left_cur:left_new - 1 right_range = right_cur:right_new - 1 - to_grow = Base.checked_mul(length(left_range), length(right_range)) - Base._growend!(left_ixs, to_grow) - Base._growend!(right_ixs, to_grow) + to_grow = Base.checked_add(idx, Base.checked_mul(length(left_range), length(right_range))) + resize!(left_ixs, to_grow) + resize!(right_ixs, to_grow) @inbounds for right_i in right_range, left_i in left_range idx += 1 left_ixs[idx] = left_i From 020eaae7bea291bc504ffd513fafedd93bdde2da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 10 Feb 2021 15:48:16 +0100 Subject: [PATCH 49/59] add sizehint! --- src/abstractdataframe/join.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 786a2d9ca0..33a75c5cf9 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -414,6 +414,9 @@ end function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where {T} dict = Dict{T, Int}() + right_len = length(right) + sizehint!(dict, right_len) + right isa OnCol && _prehash(right) left isa OnCol && _prehash(left) @@ -429,6 +432,10 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where left_ixs = Int[] right_ixs = Int[] + # lower bound assuming we get matches + sizehint!(left_ixs, right_len) + sizehint!(right_ixs, right_len) + for (idx_l, val_l) in enumerate(left) # we use dict_index to make sure the following two operations are fast: # - if index is found - get it and process it @@ -479,6 +486,10 @@ function _innerjoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}} left_ixs = Int[] right_ixs = Int[] + right_len = length(right) + sizehint!(left_ixs, right_len) + sizehint!(right_ixs, right_len) + @inbounds for (idx_l, val_l) in enumerate(left) # we use dict_index to make sure the following two operations are fast: # - if index is found - get it and process it From a3de1c28e0546dad5aa0890766c7e10fa7b2601d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 11 Feb 2021 09:27:01 +0100 Subject: [PATCH 50/59] avoid using internal functions --- src/abstractdataframe/join.jl | 45 +++++++++++++---------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 33a75c5cf9..31dcc0f09f 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -421,12 +421,8 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where left isa OnCol && _prehash(left) for (idx_r, val_r) in enumerate(right) - # we use dict_index to make sure the following two operations are fast: - # - if index is found - fall back to algorithm allowing duplicates - # - if index is not found - add it - dict_index = Base.ht_keyindex2!(dict, val_r) - dict_index > 0 && return _innerjoin_dup(left, right, dict, idx_r) - Base._setindex!(dict, idx_r, val_r, -dict_index) + haskey(dict, val_r) && return _innerjoin_dup(left, right, dict, idx_r) + dict[val_r] = idx_r end left_ixs = Int[] @@ -437,12 +433,9 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where sizehint!(right_ixs, right_len) for (idx_l, val_l) in enumerate(left) - # we use dict_index to make sure the following two operations are fast: - # - if index is found - get it and process it - # - if index is not found - do nothing - dict_index = Base.ht_keyindex(dict, val_l) - if dict_index > 0 # -1 if key not found - @inbounds idx_r = dict.vals[dict_index] + # we know that dict contains only positive values + idx_r = get(dict, val_l, -1) + if idx_r != -1 push!(left_ixs, idx_l) push!(right_ixs, idx_r) end @@ -520,18 +513,16 @@ function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}, groups = Vector{Int}(undef, right_len) groups[1:ngroups] = 1:ngroups - for idx_r in idx_r_start:right_len - @inbounds val_r = right[idx_r] - # we use dict_index to make sure the following two operations are fast: - # - if index is found - process the row with existing group number - # - if index is not found - add a new group - dict_index = Base.ht_keyindex2!(dict, val_r) - if dict_index > 0 - @inbounds groups[idx_r] = dict.vals[dict_index] - else + @inbounds for idx_r in idx_r_start:right_len + val_r = right[idx_r] + # we know that group ids are positive + group_id = get(dict, val_r, -1) + if group_id == -1 ngroups += 1 - @inbounds groups[idx_r] = ngroups - Base._setindex!(dict, ngroups, val_r, -dict_index) + groups[idx_r] = ngroups + dict[val_r] = ngroups + else + groups[idx_r] = group_id end end @@ -597,12 +588,8 @@ function _innerjoin_postprocess(left::AbstractArray, dict::Dict{T, Int}, n = 0 @inbounds for (idx_l, val_l) in enumerate(left) - # we use dict_index to make sure the following two operations are fast: - # - if index is found - get it and process it - # - if index is not found - do nothing - dict_index = Base.ht_keyindex(dict, val_l) - if dict_index > 0 # -1 if key not found - group_id = dict.vals[dict_index] + group_id = get(dict, val_l, -1) + if group_id != -1 ref_stop = starts[group_id + 1] l = ref_stop - starts[group_id] newn = n + l From 659ec7cd37314aaa465ecd6cca073b16ba473f1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 11 Feb 2021 16:13:08 +0100 Subject: [PATCH 51/59] improved benchmark design --- benchmarks/innerjoin_performance.jl | 333 ++++++++-------------------- benchmarks/runtests.jl | 12 + 2 files changed, 100 insertions(+), 245 deletions(-) create mode 100644 benchmarks/runtests.jl diff --git a/benchmarks/innerjoin_performance.jl b/benchmarks/innerjoin_performance.jl index 4abfd0f432..390b39727d 100644 --- a/benchmarks/innerjoin_performance.jl +++ b/benchmarks/innerjoin_performance.jl @@ -1,254 +1,97 @@ using CategoricalArrays -using CSV using DataFrames -using Dates using PooledArrays using Random -function run_innerjoin_tests(warmup::Bool = false) - warmup || run_innerjoin_tests(true) - Random.seed!(1234); - - @info warmup ? "warmup" : "testing performance" - - df = DataFrame(llen=[], rlen=[], type=[], time=[], alloc=[], gc=[]) - # change line below to match - # your preferred range of values tested - # and memory availability in your system - for llen in [10^3, 10^6], rlen in [2*10^7] - if warmup - llen, rlen = 1000, 1000 - else - println("\nSize:") - @show llen, rlen - end - println() - - warmup || @info "sorted string unique" - df1, df2 = nothing, nothing - df1 = DataFrame(id = sort!(string.(1:llen)), copycols=false) - df2 = DataFrame(id = sort!(string.(1:rlen)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "sorted string unique", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "sorted string unique", x.time, x.bytes, x.gctime]) - - warmup || @info "shuffled string unique" - df1, df2 = nothing, nothing - df1 = DataFrame(id = shuffle!(string.(1:llen)), copycols=false) - df2 = DataFrame(id = shuffle!(string.(1:rlen)), copycols=false) - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "shuffled string unique", x.time, x.bytes, x.gctime]) - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "shuffled string unique", x.time, x.bytes, x.gctime]) - - warmup || @info "sorted string duplicates" - df1, df2 = nothing, nothing - df1 = DataFrame(id = sort!(rand(string.(1:llen), llen)), copycols=false) - df2 = DataFrame(id = sort!(rand(string.(1:rlen), rlen)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "sorted string duplicates", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "sorted string duplicates", x.time, x.bytes, x.gctime]) - - warmup || @info "sorted string duplicates many" - df1, df2 = nothing, nothing - df1 = DataFrame(id = sort!(rand(string.(1:llen ÷ 100), llen)), copycols=false) - df2 = DataFrame(id = sort!(rand(string.(1:rlen ÷ 100), rlen)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "sorted string duplicates many", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "sorted string duplicates many", x.time, x.bytes, x.gctime]) - - warmup || @info "shuffled string duplicates" - df1, df2 = nothing, nothing - df1 = DataFrame(id = rand(string.(1:llen), llen), copycols=false) - df2 = DataFrame(id = rand(string.(1:rlen), rlen), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "shuffled string duplicates", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "shuffled string duplicates", x.time, x.bytes, x.gctime]) - - warmup || @info "shuffled string duplicates many" - df1, df2 = nothing, nothing - df1 = DataFrame(id = rand(string.(1:llen ÷ 100), llen), copycols=false) - df2 = DataFrame(id = rand(string.(1:rlen ÷ 100), rlen), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "shuffled string duplicates many", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "shuffled string duplicates many", x.time, x.bytes, x.gctime]) - - - warmup || @info "sorted int unique" - df1, df2 = nothing, nothing - df1 = DataFrame(id = sort!(1:llen), copycols=false) - df2 = DataFrame(id = sort!(1:rlen), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "sorted int unique", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "sorted int unique", x.time, x.bytes, x.gctime]) - - warmup || @info "shuffled int unique" - df1, df2 = nothing, nothing - df1 = DataFrame(id = shuffle(1:llen), copycols=false) - df2 = DataFrame(id = shuffle(1:rlen), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "shuffled int unique", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "shuffled int unique", x.time, x.bytes, x.gctime]) - - warmup || @info "sorted int duplicates" - df1, df2 = nothing, nothing - df1 = DataFrame(id = sort!(rand(1:llen, llen)), copycols=false) - df2 = DataFrame(id = sort!(rand(1:rlen, rlen)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "sorted int duplicates", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "sorted int duplicates", x.time, x.bytes, x.gctime]) - - warmup || @info "sorted int duplicates many" - df1, df2 = nothing, nothing - df1 = DataFrame(id = sort!(rand(1:llen ÷ 100, llen)), copycols=false) - df2 = DataFrame(id = sort!(rand(1:rlen ÷ 100, rlen)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "sorted int duplicates many", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "sorted int duplicates many", x.time, x.bytes, x.gctime]) - - warmup || @info "shuffled int duplicates" - df1, df2 = nothing, nothing - df1 = DataFrame(id = rand(1:llen, llen), copycols=false) - df2 = DataFrame(id = rand(1:rlen, rlen), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "shuffled int duplicates", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "shuffled int duplicates", x.time, x.bytes, x.gctime]) - - warmup || @info "shuffled int duplicates many" - df1, df2 = nothing, nothing - df1 = DataFrame(id = rand(1:llen ÷ 100, llen), copycols=false) - df2 = DataFrame(id = rand(1:rlen ÷ 100, rlen), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "shuffled int duplicates many", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "shuffled int duplicates many", x.time, x.bytes, x.gctime]) - - - warmup || @info "sorted PooledArray duplicates" - df1, df2 = nothing, nothing - df1 = DataFrame(id = PooledArray(rand(string.(1:llen), llen)), copycols=false) - df2 = DataFrame(id = PooledArray(repeat(string.(1:rlen ÷ 10), inner=10)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "sorted PooledArray duplicates", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "sorted PooledArray duplicates", x.time, x.bytes, x.gctime]) - - warmup || @info "sorted PooledArray duplicates many" - df1, df2 = nothing, nothing - df1 = DataFrame(id = PooledArray(rand(string.(1:llen ÷ 100), llen)), copycols=false) - df2 = DataFrame(id = PooledArray(repeat(string.(1:rlen ÷ 100), inner=10)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "sorted PooledArray duplicates many", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "sorted PooledArray duplicates many", x.time, x.bytes, x.gctime]) - - warmup || @info "shuffled PooledArray duplicates" - df1, df2 = nothing, nothing - df1 = DataFrame(id = PooledArray(rand(string.(1:llen), llen)), copycols=false) - df2 = DataFrame(id = PooledArray(rand(string.(1:rlen ÷ 10), rlen)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "shuffled PooledArray duplicates", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "shuffled PooledArray duplicates", x.time, x.bytes, x.gctime]) - - warmup || @info "shuffled PooledArray duplicates many" - df1, df2 = nothing, nothing - df1 = DataFrame(id = PooledArray(rand(string.(1:llen ÷ 100), llen)), copycols=false) - df2 = DataFrame(id = PooledArray(rand(string.(1:rlen ÷ 100), rlen)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "shuffled PooledArray duplicates many", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "shuffled PooledArray duplicates many", x.time, x.bytes, x.gctime]) - - - warmup || @info "sorted CategoricalArray duplicates" - df1, df2 = nothing, nothing - df1 = DataFrame(id = categorical(rand(string.(1:llen), llen)), copycols=false) - df2 = DataFrame(id = categorical(repeat(string.(1:rlen ÷ 10), inner=10)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "sorted CategoricalArray duplicates", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "sorted CategoricalArray duplicates", x.time, x.bytes, x.gctime]) - - warmup || @info "sorted CategoricalArray duplicates many" - df1, df2 = nothing, nothing - df1 = DataFrame(id = categorical(rand(string.(1:llen ÷ 100), llen)), copycols=false) - df2 = DataFrame(id = categorical(repeat(string.(1:rlen ÷ 100), inner=10)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "sorted CategoricalArray duplicates many", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "sorted CategoricalArray duplicates many", x.time, x.bytes, x.gctime]) - - warmup || @info "shuffled CategoricalArray duplicates" - df1, df2 = nothing, nothing - df1 = DataFrame(id = categorical(rand(string.(1:llen), llen)), copycols=false) - df2 = DataFrame(id = categorical(rand(string.(1:rlen ÷ 10), rlen)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "shuffled CategoricalArray duplicates", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "shuffled CategoricalArray duplicates", x.time, x.bytes, x.gctime]) +fullgc() = (GC.gc(); GC.gc(); GC.gc(); GC.gc()) + +@assert length(ARGS) == 6 +@assert ARGS[3] in ["int", "pool", "cat", "str"] +@assert ARGS[4] in ["uniq", "dup", "manydup"] +@assert ARGS[5] in ["sort", "rand"] +@assert ARGS[6] in ["1", "2"] + +@info ARGS + +llen = parse(Int, ARGS[1]) +rlen = parse(Int, ARGS[2]) +@assert llen > 1000 +@assert rlen > 2000 + +pad = maximum(length.(string.((llen, rlen)))) + +if ARGS[3] == "int" + if ARGS[4] == "uniq" + col1 = [1:llen;] + col2 = [1:rlen;] + elseif ARGS[4] == "dup" + col1 = repeat(1:llen ÷ 2, inner=2) + col2 = repeat(1:rlen ÷ 2, inner=2) + else + @assert ARGS[4] == "manydup" + col1 = repeat(1:llen ÷ 20, inner=20) + col2 = repeat(1:rlen ÷ 20, inner=20) + end +elseif ARGS[3] == "pool" + if ARGS[4] == "dup" + col1 = PooledArray(repeat(string.(1:llen ÷ 2, pad=pad), inner=2)) + col2 = PooledArray(repeat(string.(1:rlen ÷ 2, pad=pad), inner=2)) + else + @assert ARGS[4] == "manydup" + col1 = PooledArray(repeat(string.(1:llen ÷ 20, pad=pad), inner=20)) + col2 = PooledArray(repeat(string.(1:rlen ÷ 20, pad=pad), inner=20)) + end +elseif ARGS[3] == "cat" + if ARGS[4] == "dup" + col1 = categorical(repeat(string.(1:llen ÷ 2, pad=pad), inner=2)) + col2 = categorical(repeat(string.(1:rlen ÷ 2, pad=pad), inner=2)) + else + @assert ARGS[4] == "manydup" + col1 = categorical(repeat(string.(1:llen ÷ 20, pad=pad), inner=20)) + col2 = categorical(repeat(string.(1:rlen ÷ 20, pad=pad), inner=20)) + end +else + @assert ARGS[3] == "str" + if ARGS[4] == "uniq" + col1 = string.(1:llen, pad=pad) + col2 = string.(1:rlen, pad=pad) + elseif ARGS[4] == "dup" + col1 = repeat(string.(1:llen ÷ 2, pad=pad), inner=2) + col2 = repeat(string.(1:rlen ÷ 2, pad=pad), inner=2) + else + @assert ARGS[4] == "manydup" + col1 = repeat(string.(1:llen ÷ 20, pad=pad), inner=20) + col2 = repeat(string.(1:rlen ÷ 20, pad=pad), inner=20) + end +end - warmup || @info "shuffled CategoricalArray duplicates many" - df1, df2 = nothing, nothing - df1 = DataFrame(id = categorical(rand(string.(1:llen ÷ 100), llen)), copycols=false) - df2 = DataFrame(id = categorical(rand(string.(1:rlen ÷ 100), rlen)), copycols=false) - GC.gc() - x = @timed innerjoin(df1, df2, on=:id) - push!(df, [llen, rlen, "shuffled CategoricalArray duplicates many", x.time, x.bytes, x.gctime]) - GC.gc() - x = @timed innerjoin(df2, df1, on=:id) - push!(df, [llen, rlen, "shuffled CategoricalArray duplicates many", x.time, x.bytes, x.gctime]) +Random.seed!(1234) - warmup && break - end - return df +if ARGS[5] == "rand" + shuffle!(col1) + shuffle!(col2) +else + @assert ARGS[5] == "sort" end -res = run_innerjoin_tests() -CSV.write("results_$(now()).csv", res) +if ARGS[6] == "1" + df1 = DataFrame(id1 = col1); + df2 = DataFrame(id1 = col2); + innerjoin(df1[1:1000, :], df2[1:2000, :], on=:id1); + innerjoin(df2[1:2000, :], df1[1:1000, :], on=:id1); + fullgc(); + @time innerjoin(df1, df2, on=:id1); + fullgc(); + @time innerjoin(df2, df1, on=:id1); +else + @assert ARGS[6] == "2" + df1 = DataFrame(id1 = col1, id2 = col1); + df2 = DataFrame(id1 = col1, id2 = col1); + innerjoin(df1[1:1000, :], df2[1:2000, :], on=[:id1, :id2]); + innerjoin(df2[1:2000, :], df1[1:1000, :], on=[:id1, :id2]); + fullgc(); + @time innerjoin(df1, df2, on=[:id1, :id2]); + fullgc(); + @time innerjoin(df2, df1, on=[:id1, :id2]); + df2 = DataFrame(id1 = col2, id2 = col2); +end diff --git a/benchmarks/runtests.jl b/benchmarks/runtests.jl new file mode 100644 index 0000000000..a17959f576 --- /dev/null +++ b/benchmarks/runtests.jl @@ -0,0 +1,12 @@ +@assert length(ARGS) == 2 +file_loc = joinpath(dirname(@__FILE__), "innerjoin_performance.jl") +llen = ARGS[1] +rlen = ARGS[2] + +for a3 in ["str", "int", "pool", "cat"], + a4 in ["uniq", "dup", "manydup"], + a5 in ["sort", "rand"], + a6 in ["1", "2"] + a4 == "uniq" && a3 in ["pool", "cat"] && continue + run(`julia $file_loc $llen $rlen $a3 $a4 $a5 $a6`) +end From 07ecb0a34b2d6d30f95bb7ff570f4380238baa75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 11 Feb 2021 16:13:26 +0100 Subject: [PATCH 52/59] Revert "avoid using internal functions" This reverts commit a3de1c28e0546dad5aa0890766c7e10fa7b2601d. --- src/abstractdataframe/join.jl | 45 ++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 31dcc0f09f..33a75c5cf9 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -421,8 +421,12 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where left isa OnCol && _prehash(left) for (idx_r, val_r) in enumerate(right) - haskey(dict, val_r) && return _innerjoin_dup(left, right, dict, idx_r) - dict[val_r] = idx_r + # we use dict_index to make sure the following two operations are fast: + # - if index is found - fall back to algorithm allowing duplicates + # - if index is not found - add it + dict_index = Base.ht_keyindex2!(dict, val_r) + dict_index > 0 && return _innerjoin_dup(left, right, dict, idx_r) + Base._setindex!(dict, idx_r, val_r, -dict_index) end left_ixs = Int[] @@ -433,9 +437,12 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where sizehint!(right_ixs, right_len) for (idx_l, val_l) in enumerate(left) - # we know that dict contains only positive values - idx_r = get(dict, val_l, -1) - if idx_r != -1 + # we use dict_index to make sure the following two operations are fast: + # - if index is found - get it and process it + # - if index is not found - do nothing + dict_index = Base.ht_keyindex(dict, val_l) + if dict_index > 0 # -1 if key not found + @inbounds idx_r = dict.vals[dict_index] push!(left_ixs, idx_l) push!(right_ixs, idx_r) end @@ -513,16 +520,18 @@ function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}, groups = Vector{Int}(undef, right_len) groups[1:ngroups] = 1:ngroups - @inbounds for idx_r in idx_r_start:right_len - val_r = right[idx_r] - # we know that group ids are positive - group_id = get(dict, val_r, -1) - if group_id == -1 - ngroups += 1 - groups[idx_r] = ngroups - dict[val_r] = ngroups + for idx_r in idx_r_start:right_len + @inbounds val_r = right[idx_r] + # we use dict_index to make sure the following two operations are fast: + # - if index is found - process the row with existing group number + # - if index is not found - add a new group + dict_index = Base.ht_keyindex2!(dict, val_r) + if dict_index > 0 + @inbounds groups[idx_r] = dict.vals[dict_index] else - groups[idx_r] = group_id + ngroups += 1 + @inbounds groups[idx_r] = ngroups + Base._setindex!(dict, ngroups, val_r, -dict_index) end end @@ -588,8 +597,12 @@ function _innerjoin_postprocess(left::AbstractArray, dict::Dict{T, Int}, n = 0 @inbounds for (idx_l, val_l) in enumerate(left) - group_id = get(dict, val_l, -1) - if group_id != -1 + # we use dict_index to make sure the following two operations are fast: + # - if index is found - get it and process it + # - if index is not found - do nothing + dict_index = Base.ht_keyindex(dict, val_l) + if dict_index > 0 # -1 if key not found + group_id = dict.vals[dict_index] ref_stop = starts[group_id + 1] l = ref_stop - starts[group_id] newn = n + l From 58bdcf389b7987c95e801262337f6009cd51955b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 11 Feb 2021 17:51:50 +0100 Subject: [PATCH 53/59] fix dict sizehint --- src/abstractdataframe/join.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 33a75c5cf9..38fc33f656 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -415,7 +415,7 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where dict = Dict{T, Int}() right_len = length(right) - sizehint!(dict, right_len) + sizehint!(dict, 2 * min(right_len, typemax(Int) >> 2)) right isa OnCol && _prehash(right) left isa OnCol && _prehash(left) From d7bb989c6883273ee7bf6d2a06945ff4d9094ed3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 11 Feb 2021 18:38:09 +0100 Subject: [PATCH 54/59] add benchmark runner --- benchmarks/run.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 benchmarks/run.sh diff --git a/benchmarks/run.sh b/benchmarks/run.sh new file mode 100644 index 0000000000..f23eef6a3a --- /dev/null +++ b/benchmarks/run.sh @@ -0,0 +1,2 @@ +julia runtests.jl 100000 50000000 +julia runtests.jl 5000000 10000000 From f9882f8ee338e18f7184d509e65a51cb385da50a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 11 Feb 2021 20:16:53 +0100 Subject: [PATCH 55/59] Revert "Revert "avoid using internal functions"" This reverts commit 07ecb0a34b2d6d30f95bb7ff570f4380238baa75. --- src/abstractdataframe/join.jl | 45 +++++++++++++---------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 38fc33f656..206f8edde7 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -421,12 +421,8 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where left isa OnCol && _prehash(left) for (idx_r, val_r) in enumerate(right) - # we use dict_index to make sure the following two operations are fast: - # - if index is found - fall back to algorithm allowing duplicates - # - if index is not found - add it - dict_index = Base.ht_keyindex2!(dict, val_r) - dict_index > 0 && return _innerjoin_dup(left, right, dict, idx_r) - Base._setindex!(dict, idx_r, val_r, -dict_index) + haskey(dict, val_r) && return _innerjoin_dup(left, right, dict, idx_r) + dict[val_r] = idx_r end left_ixs = Int[] @@ -437,12 +433,9 @@ function _innerjoin_unsorted(left::AbstractArray, right::AbstractArray{T}) where sizehint!(right_ixs, right_len) for (idx_l, val_l) in enumerate(left) - # we use dict_index to make sure the following two operations are fast: - # - if index is found - get it and process it - # - if index is not found - do nothing - dict_index = Base.ht_keyindex(dict, val_l) - if dict_index > 0 # -1 if key not found - @inbounds idx_r = dict.vals[dict_index] + # we know that dict contains only positive values + idx_r = get(dict, val_l, -1) + if idx_r != -1 push!(left_ixs, idx_l) push!(right_ixs, idx_r) end @@ -520,18 +513,16 @@ function _innerjoin_dup(left::AbstractArray, right::AbstractArray{T}, groups = Vector{Int}(undef, right_len) groups[1:ngroups] = 1:ngroups - for idx_r in idx_r_start:right_len - @inbounds val_r = right[idx_r] - # we use dict_index to make sure the following two operations are fast: - # - if index is found - process the row with existing group number - # - if index is not found - add a new group - dict_index = Base.ht_keyindex2!(dict, val_r) - if dict_index > 0 - @inbounds groups[idx_r] = dict.vals[dict_index] - else + @inbounds for idx_r in idx_r_start:right_len + val_r = right[idx_r] + # we know that group ids are positive + group_id = get(dict, val_r, -1) + if group_id == -1 ngroups += 1 - @inbounds groups[idx_r] = ngroups - Base._setindex!(dict, ngroups, val_r, -dict_index) + groups[idx_r] = ngroups + dict[val_r] = ngroups + else + groups[idx_r] = group_id end end @@ -597,12 +588,8 @@ function _innerjoin_postprocess(left::AbstractArray, dict::Dict{T, Int}, n = 0 @inbounds for (idx_l, val_l) in enumerate(left) - # we use dict_index to make sure the following two operations are fast: - # - if index is found - get it and process it - # - if index is not found - do nothing - dict_index = Base.ht_keyindex(dict, val_l) - if dict_index > 0 # -1 if key not found - group_id = dict.vals[dict_index] + group_id = get(dict, val_l, -1) + if group_id != -1 ref_stop = starts[group_id + 1] l = ref_stop - starts[group_id] newn = n + l From 8a31d99e7999266c22032ea9483216ea6d971e53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 11 Feb 2021 21:26:56 +0100 Subject: [PATCH 56/59] clean up script --- benchmarks/innerjoin_performance.jl | 35 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/benchmarks/innerjoin_performance.jl b/benchmarks/innerjoin_performance.jl index 390b39727d..1c28199c3b 100644 --- a/benchmarks/innerjoin_performance.jl +++ b/benchmarks/innerjoin_performance.jl @@ -3,7 +3,7 @@ using DataFrames using PooledArrays using Random -fullgc() = (GC.gc(); GC.gc(); GC.gc(); GC.gc()) +fullgc() = (GC.gc(true); GC.gc(true); GC.gc(true); GC.gc(true)) @assert length(ARGS) == 6 @assert ARGS[3] in ["int", "pool", "cat", "str"] @@ -75,23 +75,22 @@ else end if ARGS[6] == "1" - df1 = DataFrame(id1 = col1); - df2 = DataFrame(id1 = col2); - innerjoin(df1[1:1000, :], df2[1:2000, :], on=:id1); - innerjoin(df2[1:2000, :], df1[1:1000, :], on=:id1); - fullgc(); - @time innerjoin(df1, df2, on=:id1); - fullgc(); - @time innerjoin(df2, df1, on=:id1); + df1 = DataFrame(id1 = col1) + df2 = DataFrame(id1 = col2) + innerjoin(df1[1:1000, :], df2[1:2000, :], on=:id1) + innerjoin(df2[1:2000, :], df1[1:1000, :], on=:id1) + fullgc() + @time innerjoin(df1, df2, on=:id1) + fullgc() + @time innerjoin(df2, df1, on=:id1) else @assert ARGS[6] == "2" - df1 = DataFrame(id1 = col1, id2 = col1); - df2 = DataFrame(id1 = col1, id2 = col1); - innerjoin(df1[1:1000, :], df2[1:2000, :], on=[:id1, :id2]); - innerjoin(df2[1:2000, :], df1[1:1000, :], on=[:id1, :id2]); - fullgc(); - @time innerjoin(df1, df2, on=[:id1, :id2]); - fullgc(); - @time innerjoin(df2, df1, on=[:id1, :id2]); - df2 = DataFrame(id1 = col2, id2 = col2); + df1 = DataFrame(id1 = col1, id2 = col1) + df2 = DataFrame(id1 = col1, id2 = col1) + innerjoin(df1[1:1000, :], df2[1:2000, :], on=[:id1, :id2]) + innerjoin(df2[1:2000, :], df1[1:1000, :], on=[:id1, :id2]) + fullgc() + @time innerjoin(df1, df2, on=[:id1, :id2]) + fullgc() + @time innerjoin(df2, df1, on=[:id1, :id2]) end From 91df0e45f465613487734b00e217937334977b35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 12 Feb 2021 00:04:26 +0100 Subject: [PATCH 57/59] Update test/join.jl --- test/join.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/join.jl b/test/join.jl index d160f8e5e4..7e18111982 100644 --- a/test/join.jl +++ b/test/join.jl @@ -1119,6 +1119,8 @@ end DataFrame(id=[]) @test innerjoin(DataFrame(id=Union{Int, Missing}[]), DataFrame(id=[1]), on=:id, matchmissing=:equal) == DataFrame(id=[]) + @test innerjoin(DataFrame(id=Union{Int, Missing}[]), DataFrame(id=[2, 1]), on=:id, matchmissing=:equal) == + DataFrame(id=[]) @test innerjoin(DataFrame(id=Union{Int, Missing}[missing]), DataFrame(id=[1]), on=:id, matchmissing=:equal) == DataFrame(id=[]) @test innerjoin(DataFrame(id=[missing]), DataFrame(id=[1, missing]), From 0b219724b35d46794e14ef2f4ff86a4bba349c95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 12 Feb 2021 09:21:29 +0100 Subject: [PATCH 58/59] improve tests --- test/join.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/join.jl b/test/join.jl index 7e18111982..ff709e8420 100644 --- a/test/join.jl +++ b/test/join.jl @@ -1119,7 +1119,7 @@ end DataFrame(id=[]) @test innerjoin(DataFrame(id=Union{Int, Missing}[]), DataFrame(id=[1]), on=:id, matchmissing=:equal) == DataFrame(id=[]) - @test innerjoin(DataFrame(id=Union{Int, Missing}[]), DataFrame(id=[2, 1]), on=:id, matchmissing=:equal) == + @test innerjoin(DataFrame(id=Union{Int, Missing}[]), DataFrame(id=[2, 1, 2]), on=:id, matchmissing=:equal) == DataFrame(id=[]) @test innerjoin(DataFrame(id=Union{Int, Missing}[missing]), DataFrame(id=[1]), on=:id, matchmissing=:equal) == DataFrame(id=[]) From 1a9e664c9a4945b5a17b73c7f1f5b9bf7918960c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 13 Feb 2021 12:42:55 +0100 Subject: [PATCH 59/59] Update NEWS.md --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 9a01e761ad..acb4f2fb14 100644 --- a/NEWS.md +++ b/NEWS.md @@ -35,7 +35,7 @@ * `innerjoin` is now much faster and checks if passed data frames are sorted by the `on` columns and takes into account if shorter data frame that is joined - has unique values in `on` columns. These aspect of input data frames might affect + has unique values in `on` columns. These aspects of input data frames might affect the order of rows produced in the output ([#2612](https://github.com/JuliaData/DataFrames.jl/pull/2612))