diff --git a/NEWS.md b/NEWS.md index 241b81279d..a21e658724 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,11 @@ * `mapcols!` makes sure not to create columns being `AbstractRange` consistently with other methods that add columns to a `DataFrame` ([#2594](https://github.com/JuliaData/DataFrames.jl/pull/2594)) +* `transform` and `transform!` always copy columns when column renaming transformation + is passed. If similar issues are identified after 1.0 release (i.e. that a + copy of data is not made in scenarios where it normally should be made these + will be considered bugs and fixed as non-breaking changes) + ([#2721](https://github.com/JuliaData/DataFrames.jl/pull/2721)) ## New functionalities diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 4e63ba30d1..2aaf48a28e 100755 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -138,6 +138,8 @@ const TRANSFORMATION_COMMON_RULES = transformations that return the same object without copying may create column aliases even if `copycols=true`. An example of such a situation is `select!(df, :a, :a => :b, :a => identity => :c)`. + As a special case in `transform` and `transform!` column renaming always + copies columns to avoid storing aliased columns in the target data frame. If `df` is a `SubDataFrame` and `copycols=true` then a `DataFrame` is returned and the same copying rules apply as for a `DataFrame` input: this @@ -270,14 +272,14 @@ function normalize_selection(idx::AbstractIndex, allunique(combine_target_col) || throw(ArgumentError("target column names must be unique")) end - if wanttable - combine_src = AsTable(c) - else + if wanttable + combine_src = AsTable(c) + else combine_src = (length(c) == 1 ? only(c) : c) end combine_func = first(last(sel)) - + return combine_src => combine_func => combine_target_col end @@ -338,9 +340,9 @@ function normalize_selection(idx::AbstractIndex, end end - if wanttable - combine_src = AsTable(c) - else + if wanttable + combine_src = AsTable(c) + else combine_src = (length(c) == 1 ? only(c) : c) end @@ -656,8 +658,17 @@ $TRANSFORMATION_COMMON_RULES See [`select`](@ref) for examples. """ -transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) = - select!(df, :, args..., renamecols=renamecols) +function transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) + idx = index(df) + newargs = Any[if sel isa Pair{<:ColumnIndex, Symbol} + idx[first(sel)] => copy => last(sel) + elseif sel isa Pair{<:ColumnIndex, <:AbstractString} + idx[first(sel)] => copy => Symbol(last(sel)) + else + sel + end for sel in args] + return select!(df, :, newargs..., renamecols=renamecols) +end function transform!(@nospecialize(arg::Base.Callable), df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon @@ -978,8 +989,20 @@ ERROR: ArgumentError: column :x in returned data frame is not equal to grouping See [`select`](@ref) for more examples. """ -transform(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true) = - select(df, :, args..., copycols=copycols, renamecols=renamecols) +function transform(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true) + idx = index(df) + # when using the copy function the copy of source data frame + # is made exactly once even if copycols=true + # (copycols=true makes a copy only if the column was not copied previously) + newargs = Any[if sel isa Pair{<:ColumnIndex, Symbol} + idx[first(sel)] => copy => last(sel) + elseif sel isa Pair{<:ColumnIndex, <:AbstractString} + idx[first(sel)] => copy => Symbol(last(sel)) + else + sel + end for sel in args] + return select(df, :, newargs..., copycols=copycols, renamecols=renamecols) +end function transform(@nospecialize(arg::Base.Callable), df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon diff --git a/src/other/precompile.jl b/src/other/precompile.jl index da211ac8b9..10d60892db 100644 --- a/src/other/precompile.jl +++ b/src/other/precompile.jl @@ -134,6 +134,9 @@ function precompile(all=false) end Base.precompile(Tuple{typeof(row_group_slots),Tuple{RepeatedVector{Union{Missing, Int}}, RepeatedVector{Union{Missing, Int}}, RepeatedVector{Union{Missing, String}}, Vector{Union{Missing, Int}}},Val{false},Vector{Int},Bool,Bool}) Base.precompile(Tuple{typeof(combine),Union{Function, Type},GroupedDataFrame{DataFrame}}) + Base.precompile(Tuple{typeof(select_transform!),Base.RefValue{Any},DataFrame,DataFrame,Set{Symbol},Bool,Base.RefValue{Bool}}) + Base.precompile(Tuple{typeof(transform!),DataFrame,Any}) + Base.precompile(Tuple{typeof(transform),DataFrame,Any}) else Base.precompile(Tuple{typeof(_sortperm),DataFrame,Base.Sort.MergeSortAlg,DFPerm{Vector{Ordering}, Tuple{Vector{Int}, Vector{Int}, Vector{Int}}}}) Base.precompile(Tuple{typeof(flatten),DataFrame,All{Tuple{}}}) @@ -1030,6 +1033,10 @@ function precompile(all=false) end Base.precompile(Tuple{typeof(row_group_slots),Tuple{Vector{Float64}},Val{false},Vector{Int},Bool,Bool}) Base.precompile(Tuple{typeof(transform),DataFrame,Any}) + Base.precompile(Tuple{typeof(transform!),DataFrame,Any}) + Base.precompile(Tuple{typeof(select_transform!),Base.RefValue{Any},DataFrame,DataFrame,Set{Symbol},Bool,Base.RefValue{Bool}}) + Base.precompile(Tuple{Core.kwftype(typeof(select)),NamedTuple{(:copycols, :renamecols), Tuple{Bool, Bool}},typeof(select),DataFrame,Any,Any}) + Base.precompile(Tuple{Core.kwftype(typeof(select!)),NamedTuple{(:renamecols,), Tuple{Bool}},typeof(select!),DataFrame,Any,Any}) Base.precompile(Tuple{typeof(push!),DataFrame,Dict{Symbol, Any}}) Base.precompile(Tuple{typeof(_transformation_helper),DataFrame,Base.OneTo{Int},Base.RefValue{Any}}) Base.precompile(Tuple{typeof(_semijoin_sorted),OnCol{Tuple{Vector{Union{Missing, Int}}, Vector{Union{Missing, Int}}, Vector{Union{Missing, Int}}}, 3},OnCol{Tuple{PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}, PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}, PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}}, 3},BitVector}) diff --git a/test/select.jl b/test/select.jl index 6534c19c76..67fbc162fd 100644 --- a/test/select.jl +++ b/test/select.jl @@ -1080,13 +1080,17 @@ end df2 = transform(df, [:x1, :x2] => +, :x2 => :x3, copycols=false) @test df2 == select(df, :, [:x1, :x2] => +, :x2 => :x3) - @test df.x2 === df2.x2 === df2.x3 + @test df.x2 == df2.x2 == df2.x3 + @test df.x2 === df2.x2 + @test df.x2 !== df2.x3 @test_throws ArgumentError transform(view(df, :, :), [:x1, :x2] => +, :x2 => :x3, copycols=false) x2 = df.x2 transform!(df, [:x1, :x2] => +, :x2 => :x3) @test df == df2 - @test x2 === df.x2 === df.x3 + @test x2 == df.x2 == df.x3 + @test x2 === df.x2 + @test x2 !== df.x3 @test transform(df) == df df2 = transform(df, copycols=false) @@ -1620,4 +1624,28 @@ end [:a] => sum => ["new"], false) == (1 => (sum => [:new])) end +@testset "copying in transform when renaming" begin + for oldcol in (:a, "a", 1), newcol in (:b, "b") + df = DataFrame(a=1) + df2 = transform(df, oldcol => newcol) + @test df2.b == df2.a == df.a + @test df2.b !== df2.a + @test df2.b !== df.a + @test df2.a !== df.a + + df2 = transform(df, oldcol => newcol, copycols=false) + @test df2.b == df2.a == df.a + @test df2.b !== df2.a + @test df2.b !== df.a + @test df2.a === df.a + + a = df.a + transform!(df, oldcol => newcol) + @test df.b == df.a == a + @test df.b !== df.a + @test df.b !== a + @test df.a === a + end +end + end # module