From 32b86d460f0a77254e4e73af6072ee5109b87885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 7 May 2021 11:55:19 +0200 Subject: [PATCH] fix performance issue in multirow split-apply-combine (#2749) --- NEWS.md | 14 ++++++++ Project.toml | 2 +- src/groupeddataframe/callprocessing.jl | 14 ++++---- src/groupeddataframe/complextransforms.jl | 26 +++++++++++--- src/groupeddataframe/splitapplycombine.jl | 44 ++++++++++++----------- 5 files changed, 67 insertions(+), 33 deletions(-) diff --git a/NEWS.md b/NEWS.md index 391d8758f0..c35d3628ec 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,17 @@ +# DataFrames.jl v1.1.1 Patch Release Notes + +## Performance improvements + +* fix performance issue when aggregation function produces multiple rows + in split-apply-combine + ([2749](https://github.com/JuliaData/DataFrames.jl/pull/2749)) +* `completecases` is now optimized and only processes columns that + can contain missing values; additionally it is now type stable and + always returns a `BitVector` + ([#2726](https://github.com/JuliaData/DataFrames.jl/pull/2726)) +* fix performance bottleneck when displaying wide tables + ([#2750](https://github.com/JuliaData/DataFrames.jl/pull/2750)) + # DataFrames.jl v1.1 Release Notes ## Functionality changes diff --git a/Project.toml b/Project.toml index 3dfa72611f..56023aac78 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "DataFrames" uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "1.1.0" +version = "1.1.1" [deps] Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" diff --git a/src/groupeddataframe/callprocessing.jl b/src/groupeddataframe/callprocessing.jl index e52def8dcf..740a9df735 100644 --- a/src/groupeddataframe/callprocessing.jl +++ b/src/groupeddataframe/callprocessing.jl @@ -90,28 +90,28 @@ end function do_call(f::Base.Callable, idx::AbstractVector{<:Integer}, starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer}, gd::GroupedDataFrame, incols::Tuple{AbstractVector}, i::Integer) - idx = idx[starts[i]:ends[i]] + idx = view(idx, starts[i]:ends[i]) return f(view(incols[1], idx)) end function do_call(f::Base.Callable, idx::AbstractVector{<:Integer}, starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer}, gd::GroupedDataFrame, incols::NTuple{2, AbstractVector}, i::Integer) - idx = idx[starts[i]:ends[i]] + idx = view(idx, starts[i]:ends[i]) return f(view(incols[1], idx), view(incols[2], idx)) end function do_call(f::Base.Callable, idx::AbstractVector{<:Integer}, starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer}, gd::GroupedDataFrame, incols::NTuple{3, AbstractVector}, i::Integer) - idx = idx[starts[i]:ends[i]] + idx = view(idx, starts[i]:ends[i]) return f(view(incols[1], idx), view(incols[2], idx), view(incols[3], idx)) end function do_call(f::Base.Callable, idx::AbstractVector{<:Integer}, starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer}, gd::GroupedDataFrame, incols::NTuple{4, AbstractVector}, i::Integer) - idx = idx[starts[i]:ends[i]] + idx = view(idx, starts[i]:ends[i]) return f(view(incols[1], idx), view(incols[2], idx), view(incols[3], idx), view(incols[4], idx)) end @@ -119,7 +119,7 @@ end function do_call(f::Base.Callable, idx::AbstractVector{<:Integer}, starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer}, gd::GroupedDataFrame, incols::Tuple, i::Integer) - idx = idx[starts[i]:ends[i]] + idx = view(idx, starts[i]:ends[i]) return f(map(c -> view(c, idx), incols)...) end @@ -129,7 +129,7 @@ function do_call(f::Base.Callable, idx::AbstractVector{<:Integer}, if f isa ByRow && isempty(incols) return [f.fun(NamedTuple()) for _ in 1:(ends[i] - starts[i] + 1)] else - idx = idx[starts[i]:ends[i]] + idx = view(idx, starts[i]:ends[i]) return f(map(c -> view(c, idx), incols)) end end @@ -137,6 +137,6 @@ end function do_call(f::Base.Callable, idx::AbstractVector{<:Integer}, starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer}, gd::GroupedDataFrame, incols::Nothing, i::Integer) - idx = idx[starts[i]:ends[i]] + idx = view(idx, starts[i]:ends[i]) return f(view(parent(gd), idx, :)) end diff --git a/src/groupeddataframe/complextransforms.jl b/src/groupeddataframe/complextransforms.jl index 9585574d20..e7b51b0e7e 100644 --- a/src/groupeddataframe/complextransforms.jl +++ b/src/groupeddataframe/complextransforms.jl @@ -30,6 +30,7 @@ function _combine_with_first((first,)::Ref{Any}, @assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame} extrude = false + lgd = length(gd) if first isa AbstractDataFrame n = 0 eltys = eltype.(eachcol(first)) @@ -37,21 +38,27 @@ function _combine_with_first((first,)::Ref{Any}, n = 0 eltys = map(eltype, first) elseif first isa DataFrameRow - n = length(gd) + n = lgd eltys = [eltype(parent(first)[!, i]) for i in parentcols(index(first))] elseif !firstmulticol && first[1] isa Union{AbstractArray{<:Any, 0}, Ref} extrude = true first = wrap_row(first[1], firstcoltype(firstmulticol)) - n = length(gd) + n = lgd eltys = (typeof(first[1]),) else # other NamedTuple giving a single row - n = length(gd) + n = lgd eltys = map(typeof, first) if any(x -> x <: AbstractVector, eltys) throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed")) end end + idx = idx_agg === NOTHING_IDX_AGG ? Vector{Int}(undef, n) : idx_agg + # assume that we will have at least one row per group; + # if the user wants to drop some groups this will over-allocate idx + # but this use case is uncommon and sizehint! is cheap. + sizehint!(idx, lgd) + local initialcols let eltys=eltys, n=n # Workaround for julia#15276 initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], n), _ncol(first)) @@ -362,7 +369,7 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame, if !isempty(colnames) && length(gd) > 0 j = append_rows!(first, outcols, colstart, colnames) @assert j === nothing # eltype is guaranteed to match - append!(idx, Iterators.repeated(gdidx[starts[rowstart]], _nrow(first))) + append_const!(idx, gdidx[starts[rowstart]], _nrow(first)) end # Handle remaining groups @inbounds for i in rowstart+1:len @@ -397,7 +404,16 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame, return _combine_tables_with_first!(rows, newcols, idx, i, j, f, gd, incols, colnames, firstmulticol) end - append!(idx, Iterators.repeated(gdidx[starts[i]], _nrow(rows))) + append_const!(idx, gdidx[starts[i]], _nrow(rows)) end return outcols, colnames end + +function append_const!(idx::Vector{Int}, val::Int, growsize::Int) + if growsize > 0 + oldsize = length(idx) + newsize = oldsize + growsize + resize!(idx, newsize) + idx[oldsize+1:newsize] .= val + end +end diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index e2336689fe..899aae8de9 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -338,7 +338,7 @@ function _combine_process_pair_symbol(optional_i::Bool, if firstmulticol throw(ArgumentError("a single value or vector result is required (got $(typeof(firstres)))")) end - # if idx_agg was not computed yet it is nothing + # if idx_agg was not computed yet it is NOTHING_IDX_AGG # in this case if we are not passed a vector compute it. lock(gd.lazy_lock) do if !(firstres isa AbstractVector) && idx_agg[] === NOTHING_IDX_AGG @@ -543,7 +543,7 @@ function _combine(gd::GroupedDataFrame, end idx_keeprows = prepare_idx_keeprows(gd.idx, gd.starts, gd.ends, nrow(parent(gd))) else - idx_keeprows = nothing + idx_keeprows = Int[] end idx_agg = Ref(NOTHING_IDX_AGG) @@ -653,24 +653,9 @@ function _combine(gd::GroupedDataFrame, # a correct index is stored in idx variable @sync for i in eachindex(trans_res) - @spawn begin - col_idx = trans_res[i].col_idx - col = trans_res[i].col - if keeprows && col_idx !== idx_keeprows # we need to reorder the column - newcol = similar(col) - # we can probably make it more efficient, but I leave it as an optimization for the future - gd_idx = gd.idx - k = 0 - # consider adding @inbounds later - for (s, e) in zip(gd.starts, gd.ends) - for j in s:e - k += 1 - newcol[gd_idx[j]] = col[k] - end - end - @assert k == length(gd_idx) - trans_res[i] = TransformationResult(col_idx, newcol, trans_res[i].name, trans_res[i].optional) - end + let i=i + @spawn reorder_cols!(trans_res, i, trans_res[i].col, trans_res[i].col_idx, + keeprows, idx_keeprows, gd) end end @@ -682,6 +667,25 @@ function _combine(gd::GroupedDataFrame, return idx, DataFrame(outcols, nms, copycols=false) end +function reorder_cols!(trans_res::Vector{TransformationResult}, i::Integer, + col::AbstractVector, col_idx::Vector{Int}, keeprows::Bool, + idx_keeprows::Vector{Int}, gd::GroupedDataFrame) + if keeprows && col_idx !== idx_keeprows # we need to reorder the column + newcol = similar(col) + # we can probably make it more efficient, but I leave it as an optimization for the future + gd_idx = gd.idx + k = 0 + for (s, e) in zip(gd.starts, gd.ends) + for j in s:e + k += 1 + @inbounds newcol[gd_idx[j]] = col[k] + end + end + @assert k == length(gd_idx) + trans_res[i] = TransformationResult(col_idx, newcol, trans_res[i].name, trans_res[i].optional) + end +end + function combine(@nospecialize(f::Base.Callable), gd::GroupedDataFrame; keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) if f isa Colon