Skip to content

Commit

Permalink
Performance of transform! on SubDataFrame (#3070)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Jun 14, 2022
1 parent 332077d commit 0ce9b0f
Show file tree
Hide file tree
Showing 7 changed files with 122 additions and 25 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@

* Make sure that `AsTable` accepts only a valid argument
([#3064](https://github.com/JuliaData/DataFrames.jl/pull/3064))
* Make sure we avoid aliasing when repeating the same column
in `select[!]` and `transform[!]` on `GroupedDataFrame`
([#3070](https://github.com/JuliaData/DataFrames.jl/pull/3070))

## Performance

Expand All @@ -63,6 +66,8 @@
* Make one-dimensional multi-element indexing of `DataFrameRows` return
`DataFrameRows`
([#3037](https://github.com/JuliaData/DataFrames.jl/pull/3037))
* Make `transform!` on `SubDataFrame` faster
([#3070](https://github.com/JuliaData/DataFrames.jl/pull/3070))

# DataFrames.jl v1.3.4 Patch Release Notes

Expand Down
2 changes: 1 addition & 1 deletion src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2534,7 +2534,7 @@ function _permutation_helper!(fun::Union{typeof(Base.permute!!), typeof(Base.inv
end

seen_cols = IdDict{Any, Nothing}()
for (i, col) in enumerate(eachcol(df))
for col in eachcol(df)
if !haskey(seen_cols, col)
seen_cols[col] = nothing
_cycle_permute!(col, cp)
Expand Down
19 changes: 14 additions & 5 deletions src/abstractdataframe/selection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,16 @@ const TRANSFORMATION_COMMON_RULES =
Operations can then be applied on each group using one of the following functions:
* `combine`: does not put restrictions on number of rows returned, the order of rows
is specified by the order of groups in `GroupedDataFrame`; it is typically used
to compute summary statistics by group;
to compute summary statistics by group; for `GroupedDataFrame` if grouping columns
are kept they are put as first columns in the result;
* `select`: return a data frame with the number and order of rows exactly the same
as the source data frame, including only new calculated columns;
`select!` is an in-place version of `select`;
`select!` is an in-place version of `select`; for `GroupedDataFrame` if grouping columns
are kept they are put as first columns in the result;
* `transform`: return a data frame with the number and order of rows exactly the same
as the source data frame, including all columns from the source and new calculated columns;
`transform!` is an in-place version of `transform`.
`transform!` is an in-place version of `transform`; for `GroupedDataFrame`
existing columns in the source data frame are put as first columns in the result;
All these functions take a specification of one or more functions to apply to
each subset of the `DataFrame`. This specification can be of the following forms:
Expand Down Expand Up @@ -707,6 +710,7 @@ function _add_col_check_copy(newdf::DataFrame, df::AbstractDataFrame,
if v === cdf[i]
if column_to_copy[i]
must_copy = true
break
else
column_to_copy[i] = true
end
Expand Down Expand Up @@ -927,7 +931,8 @@ select!(df::DataFrame, @nospecialize(args...);
select!(df::SubDataFrame, @nospecialize(args...);
renamecols::Bool=true, threads::Bool=true) =
_replace_columns!(df, select(df, args..., copycols=true,
renamecols=renamecols, threads=threads))
renamecols=renamecols, threads=threads),
keep_present=false)

function select!(@nospecialize(arg::Base.Callable), df::AbstractDataFrame;
renamecols::Bool=true, threads::Bool=true)
Expand Down Expand Up @@ -966,10 +971,14 @@ $TRANSFORMATION_COMMON_RULES
See [`select`](@ref) for examples.
"""
transform!(df::AbstractDataFrame, @nospecialize(args...);
transform!(df::DataFrame, @nospecialize(args...);
renamecols::Bool=true, threads::Bool=true) =
select!(df, :, args..., renamecols=renamecols, threads=threads)

# In-place `transform!` for a `SubDataFrame`: only the requested transformations
# are computed with `select` (with `copycols=true`), and the result is written
# back through `_replace_columns!` with `keep_present=true`.
function transform!(df::SubDataFrame, @nospecialize(args...);
                    renamecols::Bool=true, threads::Bool=true)
    newdf = select(df, args..., copycols=true, renamecols=renamecols, threads=threads)
    return _replace_columns!(df, newdf, keep_present=true)
end

function transform!(@nospecialize(arg::Base.Callable), df::AbstractDataFrame;
renamecols::Bool=true, threads::Bool=true)
if arg isa Colon
Expand Down
2 changes: 2 additions & 0 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1933,6 +1933,8 @@ end

# This is not exactly copy! as in general we allow axes to be different
function _replace_columns!(df::DataFrame, newdf::DataFrame)
# for DataFrame object here we do not support keep_present keyword argument
# like for SubDataFrame because here transform! always falls back to select!
@assert ncol(newdf) == 0 || nrow(df) == nrow(newdf)
copy!(_columns(df), _columns(newdf))
copy!(_names(index(df)), _names(newdf))
Expand Down
44 changes: 32 additions & 12 deletions src/groupeddataframe/splitapplycombine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -830,6 +830,17 @@ combine(gd::GroupedDataFrame,
copycols=true, keeprows=false, renamecols=renamecols,
threads=threads)

# Remove aliasing between the columns of `df` in place: every column that is the
# very same object as a column seen earlier (identity is tracked with an
# `IdDict`) is replaced by the copy produced by `df[:, i]`.
# The first occurrence of each column object is left untouched.
function _dealias_dataframe!(df::DataFrame)
    already_seen = IdDict{Any, Nothing}()
    for (idx, column) in enumerate(eachcol(df))
        if haskey(already_seen, column)
            # same object as an earlier column - store a fresh copy at this position
            df[!, idx] = df[:, idx]
        else
            already_seen[column] = nothing
        end
    end
    return nothing
end

function select(@nospecialize(f::Base.Callable), gd::GroupedDataFrame; copycols::Bool=true,
keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true,
threads::Bool=true)
Expand All @@ -840,14 +851,19 @@ function select(@nospecialize(f::Base.Callable), gd::GroupedDataFrame; copycols:
threads=threads)
end

select(gd::GroupedDataFrame, @nospecialize(args::Union{Pair, Base.Callable, ColumnIndex, MultiColumnIndex,
AbstractVecOrMat}...);
copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true,
threads::Bool=true) =
_combine_prepare(gd, Ref{Any}(map(x -> broadcast_pair(parent(gd), x), args)),
copycols=copycols, keepkeys=keepkeys,
ungroup=ungroup, keeprows=true, renamecols=renamecols,
threads=threads)
# `select` for a `GroupedDataFrame` given explicit operation specifications.
# Each specification is passed through `broadcast_pair` against the parent of `gd`
# and the work is delegated to `_combine_prepare` with `keeprows=true`.
# When `copycols=false` the result is additionally passed to `_dealias_dataframe!`.
function select(gd::GroupedDataFrame, @nospecialize(args::Union{Pair, Base.Callable, ColumnIndex,
                                                                 MultiColumnIndex, AbstractVecOrMat}...);
                copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true,
                threads::Bool=true)
    source = parent(gd)
    specs = Ref{Any}(map(arg -> broadcast_pair(source, arg), args))
    result = _combine_prepare(gd, specs,
                              copycols=copycols, keepkeys=keepkeys,
                              ungroup=ungroup, keeprows=true, renamecols=renamecols,
                              threads=threads)
    if !copycols
        # result can be a GroupedDataFrame based on DataFrame or a DataFrame,
        # so parent always gives a DataFrame
        _dealias_dataframe!(parent(result))
    end
    return result
end

function transform(@nospecialize(f::Base.Callable), gd::GroupedDataFrame; copycols::Bool=true,
keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true,
Expand Down Expand Up @@ -888,12 +904,13 @@ function select!(gd::GroupedDataFrame,
if df isa DataFrame
newdf = select(gd, args..., copycols=false, renamecols=renamecols,
threads=threads)
_replace_columns!(df, newdf)
else
@assert df isa SubDataFrame
newdf = select(gd, args..., copycols=true, renamecols=renamecols,
threads=threads)
_replace_columns!(df, newdf, keep_present=false)
end
_replace_columns!(df, newdf)
return ungroup ? df : gd
end

Expand All @@ -913,12 +930,15 @@ function transform!(gd::GroupedDataFrame,
if df isa DataFrame
newdf = select(gd, :, args..., copycols=false, renamecols=renamecols,
threads=threads)
# need to recover column order of df in newdf and add new columns at the end
select!(newdf, propertynames(df), :, threads=threads)
_replace_columns!(df, newdf)
else
@assert df isa SubDataFrame
newdf = select(gd, :, args..., copycols=true, renamecols=renamecols,
newdf = select(gd, args..., copycols=true, renamecols=renamecols,
threads=threads)
# here column order of df is retained due to keep_present=true
_replace_columns!(df, newdf, keep_present=true)
end
select!(newdf, propertynames(df), :, threads=threads)
_replace_columns!(df, newdf)
return ungroup ? df : gd
end
22 changes: 15 additions & 7 deletions src/subdataframe/subdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ end

Base.convert(::Type{DataFrame}, sdf::SubDataFrame) = DataFrame(sdf)

# this function tests if it is allowed to add columns to passed SubDataFrame
# this function tests if it is allowed to add/remove/reorder columns to passed SubDataFrame
# currently it is only allowed when SubDataFrame was created with : as column selector
# which results in using Index as its index (as opposed to other column selectors
# which result in SubIndex)
Expand All @@ -316,9 +316,12 @@ function is_column_insertion_allowed(df::AbstractDataFrame)
throw(ArgumentError("Unsupported data frame type"))
end

function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame)
colsmatch = _names(sdf) == _names(newdf)

function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame; keep_present::Bool)
if keep_present
colsmatch = issubset(_names(newdf), _names(sdf))
else
colsmatch = _names(newdf) == _names(sdf)
end
if !(colsmatch || is_column_insertion_allowed(sdf))
throw(ArgumentError("changing the sequence of column names in a SubDataFrame " *
"that subsets columns of its parent data frame is disallowed"))
Expand All @@ -329,14 +332,19 @@ function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame)
sdf[!, colname] = newdf[!, colname]
end

# If columns did not match this means that we have either:
# 1. inserted some columns into pdf
# if _replace_columns! was called from transform we are done as we want to
# keep all columns that were previously present.
# In this case column order will be correct.
# Otherwise if columns did not match this means that we have either:
# 1. inserted some columns into newdf
# or
# 2. requested to reorder the existing columns
# or
# 3. dropped some columns in newdf
# and that operation was allowed.
# Therefore we need to update the parent of sdf in place to make sure
# it holds only the required target columns in a correct order.
if !colsmatch
if !keep_present && !colsmatch
pdf = parent(sdf)
@assert pdf isa DataFrame
select!(pdf, _names(newdf))
Expand Down
53 changes: 53 additions & 0 deletions test/select.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2707,4 +2707,57 @@ end
@test_throws ArgumentError select(gdf, :a => [:x, :y, :z])
end

# Checks how select!/transform! handle a mix of operation specifications
# (plain column, column rename, function transformation) when applied to
# a DataFrame, a GroupedDataFrame, and to views (SubDataFrame) of a DataFrame.
@testset "handling of operation specification in select!/transform!" begin
    df = DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"])
    select!(df, :b, :c => :d, :a => (x -> 2 * x) => :e)
    @test df == DataFrame(b='a':'d', d=["p", "q", "r", "s"], e=[2, 4, 6, 8])

    df = DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"])
    transform!(df, :b, :c => :d, :a => (x -> 2 * x) => :e)
    @test df == DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"],
                          d=["p", "q", "r", "s"], e=[2, 4, 6, 8])
    # the renamed column must not alias its source column
    @test df.c !== df.d

    df = DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"])
    select!(groupby(df, :c), :b, :c => :d, :a => (x -> 2 * x) => :e)
    @test df == DataFrame(c=["p", "q", "r", "s"], b='a':'d',
                          d=["p", "q", "r", "s"], e=[2, 4, 6, 8])
    @test df.c !== df.d

    df = DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"])
    transform!(groupby(df, :c), :b, :c => :d, :a => (x -> 2 * x) => :e)
    @test df == DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"],
                          d=["p", "q", "r", "s"], e=[2, 4, 6, 8])
    @test df.c !== df.d

    # In the view-based cases below the expected parent data frame contains
    # `missing` values, so `==` would evaluate to `missing`;
    # `isequal` is used to get a `Bool` result.
    df = DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"])
    sdf = @view df[[2, 4], :]
    # note that in this test a column from parent of sdf is dropped
    select!(sdf, :b, :c => :d, :a => (x -> 2 * x) => :e)
    @test isequal(df, DataFrame(b='a':'d', d=[missing, "q", missing, "s"],
                                e=[missing, 4, missing, 8]))

    df = DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"])
    sdf = @view df[[2, 4], :]
    transform!(sdf, :b, :c => :d, :a => (x -> 2 * x) => :e)
    @test isequal(df, DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"],
                                d=[missing, "q", missing, "s"],
                                e=[missing, 4, missing, 8]))

    df = DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"])
    sdf = @view df[[2, 4], :]
    # note that in this test a column from parent of sdf is dropped
    select!(groupby(sdf, :c), :b, :c => :d, :a => (x -> 2 * x) => :e)
    @test isequal(df, DataFrame(c=["p", "q", "r", "s"], b='a':'d',
                                d=[missing, "q", missing, "s"],
                                e=[missing, 4, missing, 8]))

    df = DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"])
    sdf = @view df[[2, 4], :]
    transform!(groupby(sdf, :c), :b, :c => :d, :a => (x -> 2 * x) => :e)
    @test isequal(df, DataFrame(a=1:4, b='a':'d', c=["p", "q", "r", "s"],
                                d=[missing, "q", missing, "s"],
                                e=[missing, 4, missing, 8]))
end

end # module

0 comments on commit 0ce9b0f

Please sign in to comment.