Skip to content

Commit

Permalink
fix performance issue in multirow split-apply-combine (#2749)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored May 7, 2021
1 parent bcaa2e5 commit 32b86d4
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 33 deletions.
14 changes: 14 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# DataFrames.jl v1.1.1 Patch Release Notes

## Performance improvements

* fix performance issue when aggregation function produces multiple rows
in split-apply-combine
([2749](https://github.com/JuliaData/DataFrames.jl/pull/2749))
* `completecases` is now optimized and only processes columns that
can contain missing values; additionally it is now type stable and
always returns a `BitVector`
([#2726](https://github.com/JuliaData/DataFrames.jl/pull/2726))
* fix performance bottleneck when displaying wide tables
([#2750](https://github.com/JuliaData/DataFrames.jl/pull/2750))

# DataFrames.jl v1.1 Release Notes

## Functionality changes
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "DataFrames"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "1.1.0"
version = "1.1.1"

[deps]
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
Expand Down
14 changes: 7 additions & 7 deletions src/groupeddataframe/callprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,36 +90,36 @@ end
function do_call(f::Base.Callable, idx::AbstractVector{<:Integer},
starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer},
gd::GroupedDataFrame, incols::Tuple{AbstractVector}, i::Integer)
idx = idx[starts[i]:ends[i]]
idx = view(idx, starts[i]:ends[i])
return f(view(incols[1], idx))
end

function do_call(f::Base.Callable, idx::AbstractVector{<:Integer},
starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer},
gd::GroupedDataFrame, incols::NTuple{2, AbstractVector}, i::Integer)
idx = idx[starts[i]:ends[i]]
idx = view(idx, starts[i]:ends[i])
return f(view(incols[1], idx), view(incols[2], idx))
end

function do_call(f::Base.Callable, idx::AbstractVector{<:Integer},
starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer},
gd::GroupedDataFrame, incols::NTuple{3, AbstractVector}, i::Integer)
idx = idx[starts[i]:ends[i]]
idx = view(idx, starts[i]:ends[i])
return f(view(incols[1], idx), view(incols[2], idx), view(incols[3], idx))
end

function do_call(f::Base.Callable, idx::AbstractVector{<:Integer},
starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer},
gd::GroupedDataFrame, incols::NTuple{4, AbstractVector}, i::Integer)
idx = idx[starts[i]:ends[i]]
idx = view(idx, starts[i]:ends[i])
return f(view(incols[1], idx), view(incols[2], idx), view(incols[3], idx),
view(incols[4], idx))
end

function do_call(f::Base.Callable, idx::AbstractVector{<:Integer},
starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer},
gd::GroupedDataFrame, incols::Tuple, i::Integer)
idx = idx[starts[i]:ends[i]]
idx = view(idx, starts[i]:ends[i])
return f(map(c -> view(c, idx), incols)...)
end

Expand All @@ -129,14 +129,14 @@ function do_call(f::Base.Callable, idx::AbstractVector{<:Integer},
if f isa ByRow && isempty(incols)
return [f.fun(NamedTuple()) for _ in 1:(ends[i] - starts[i] + 1)]
else
idx = idx[starts[i]:ends[i]]
idx = view(idx, starts[i]:ends[i])
return f(map(c -> view(c, idx), incols))
end
end

function do_call(f::Base.Callable, idx::AbstractVector{<:Integer},
starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer},
gd::GroupedDataFrame, incols::Nothing, i::Integer)
idx = idx[starts[i]:ends[i]]
idx = view(idx, starts[i]:ends[i])
return f(view(parent(gd), idx, :))
end
26 changes: 21 additions & 5 deletions src/groupeddataframe/complextransforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,28 +30,35 @@ function _combine_with_first((first,)::Ref{Any},
@assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame}
extrude = false

lgd = length(gd)
if first isa AbstractDataFrame
n = 0
eltys = eltype.(eachcol(first))
elseif first isa NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}
n = 0
eltys = map(eltype, first)
elseif first isa DataFrameRow
n = length(gd)
n = lgd
eltys = [eltype(parent(first)[!, i]) for i in parentcols(index(first))]
elseif !firstmulticol && first[1] isa Union{AbstractArray{<:Any, 0}, Ref}
extrude = true
first = wrap_row(first[1], firstcoltype(firstmulticol))
n = length(gd)
n = lgd
eltys = (typeof(first[1]),)
else # other NamedTuple giving a single row
n = length(gd)
n = lgd
eltys = map(typeof, first)
if any(x -> x <: AbstractVector, eltys)
throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed"))
end
end

idx = idx_agg === NOTHING_IDX_AGG ? Vector{Int}(undef, n) : idx_agg
# assume that we will have at least one row per group;
# if the user wants to drop some groups this will over-allocate idx
# but this use case is uncommon and sizehint! is cheap.
sizehint!(idx, lgd)

local initialcols
let eltys=eltys, n=n # Workaround for julia#15276
initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], n), _ncol(first))
Expand Down Expand Up @@ -362,7 +369,7 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame,
if !isempty(colnames) && length(gd) > 0
j = append_rows!(first, outcols, colstart, colnames)
@assert j === nothing # eltype is guaranteed to match
append!(idx, Iterators.repeated(gdidx[starts[rowstart]], _nrow(first)))
append_const!(idx, gdidx[starts[rowstart]], _nrow(first))
end
# Handle remaining groups
@inbounds for i in rowstart+1:len
Expand Down Expand Up @@ -397,7 +404,16 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame,
return _combine_tables_with_first!(rows, newcols, idx, i, j,
f, gd, incols, colnames, firstmulticol)
end
append!(idx, Iterators.repeated(gdidx[starts[i]], _nrow(rows)))
append_const!(idx, gdidx[starts[i]], _nrow(rows))
end
return outcols, colnames
end

function append_const!(idx::Vector{Int}, val::Int, growsize::Int)
if growsize > 0
oldsize = length(idx)
newsize = oldsize + growsize
resize!(idx, newsize)
idx[oldsize+1:newsize] .= val
end
end
44 changes: 24 additions & 20 deletions src/groupeddataframe/splitapplycombine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ function _combine_process_pair_symbol(optional_i::Bool,
if firstmulticol
throw(ArgumentError("a single value or vector result is required (got $(typeof(firstres)))"))
end
# if idx_agg was not computed yet it is nothing
# if idx_agg was not computed yet it is NOTHING_IDX_AGG
# in this case if we are not passed a vector compute it.
lock(gd.lazy_lock) do
if !(firstres isa AbstractVector) && idx_agg[] === NOTHING_IDX_AGG
Expand Down Expand Up @@ -543,7 +543,7 @@ function _combine(gd::GroupedDataFrame,
end
idx_keeprows = prepare_idx_keeprows(gd.idx, gd.starts, gd.ends, nrow(parent(gd)))
else
idx_keeprows = nothing
idx_keeprows = Int[]
end

idx_agg = Ref(NOTHING_IDX_AGG)
Expand Down Expand Up @@ -653,24 +653,9 @@ function _combine(gd::GroupedDataFrame,
# a correct index is stored in idx variable

@sync for i in eachindex(trans_res)
@spawn begin
col_idx = trans_res[i].col_idx
col = trans_res[i].col
if keeprows && col_idx !== idx_keeprows # we need to reorder the column
newcol = similar(col)
# we can probably make it more efficient, but I leave it as an optimization for the future
gd_idx = gd.idx
k = 0
# consider adding @inbounds later
for (s, e) in zip(gd.starts, gd.ends)
for j in s:e
k += 1
newcol[gd_idx[j]] = col[k]
end
end
@assert k == length(gd_idx)
trans_res[i] = TransformationResult(col_idx, newcol, trans_res[i].name, trans_res[i].optional)
end
let i=i
@spawn reorder_cols!(trans_res, i, trans_res[i].col, trans_res[i].col_idx,
keeprows, idx_keeprows, gd)
end
end

Expand All @@ -682,6 +667,25 @@ function _combine(gd::GroupedDataFrame,
return idx, DataFrame(outcols, nms, copycols=false)
end

function reorder_cols!(trans_res::Vector{TransformationResult}, i::Integer,
col::AbstractVector, col_idx::Vector{Int}, keeprows::Bool,
idx_keeprows::Vector{Int}, gd::GroupedDataFrame)
if keeprows && col_idx !== idx_keeprows # we need to reorder the column
newcol = similar(col)
# we can probably make it more efficient, but I leave it as an optimization for the future
gd_idx = gd.idx
k = 0
for (s, e) in zip(gd.starts, gd.ends)
for j in s:e
k += 1
@inbounds newcol[gd_idx[j]] = col[k]
end
end
@assert k == length(gd_idx)
trans_res[i] = TransformationResult(col_idx, newcol, trans_res[i].name, trans_res[i].optional)
end
end

function combine(@nospecialize(f::Base.Callable), gd::GroupedDataFrame;
keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true)
if f isa Colon
Expand Down

0 comments on commit 32b86d4

Please sign in to comment.