JuliaData · bkamins · Apr 16, 2021 · Apr 15, 2021 · Apr 15, 2021 · Apr 15, 2021
diff --git a/NEWS.md b/NEWS.md
@@ -11,6 +11,11 @@
 * `mapcols!` makes sure not to create columns being `AbstractRange` consistently
   with other methods that add columns to a `DataFrame`
   ([#2594](https://github.com/JuliaData/DataFrames.jl/pull/2594))
+* `transform` and `transform!` always copy columns when column renaming transformation
+  is passed. If similar issues are identified after 1.0 release (i.e. that a
+  copy of data is not made in scenarios where it normally should be made these
+  will be considered bugs and fixed as non-breaking changes)
+  ([#2721](https://github.com/JuliaData/DataFrames.jl/pull/2721))
 
 ## New functionalities
 

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
@@ -138,6 +138,8 @@ const TRANSFORMATION_COMMON_RULES =
     transformations that return the same object without copying may create
     column aliases even if `copycols=true`. An example of such a situation is
     `select!(df, :a, :a => :b, :a => identity => :c)`.
+    As a special case in `transform` and `transform!` column renaming always
+    copies columns to avoid storing aliased columns in the target data frame.
 
     If `df` is a `SubDataFrame` and `copycols=true` then a `DataFrame` is
     returned and the same copying rules apply as for a `DataFrame` input: this
@@ -270,14 +272,14 @@ function normalize_selection(idx::AbstractIndex,
         allunique(combine_target_col) || throw(ArgumentError("target column names must be unique"))
     end
 
-    if wanttable 
-        combine_src = AsTable(c) 
-    else 
+    if wanttable
+        combine_src = AsTable(c)
+    else
         combine_src = (length(c) == 1 ? only(c) : c)
     end
 
     combine_func = first(last(sel))
-    
+
     return combine_src => combine_func => combine_target_col
 end
 
@@ -338,9 +340,9 @@ function normalize_selection(idx::AbstractIndex,
         end
     end
 
-    if wanttable 
-        combine_src = AsTable(c) 
-    else 
+    if wanttable
+        combine_src = AsTable(c)
+    else
         combine_src = (length(c) == 1 ? only(c) : c)
     end
 
@@ -656,8 +658,17 @@ $TRANSFORMATION_COMMON_RULES
 
 See [`select`](@ref) for examples.
 """
-transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) =
-    select!(df, :, args..., renamecols=renamecols)
+function transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true)
+    idx = index(df)
+    newargs = Any[if sel isa Pair{<:ColumnIndex, Symbol}
+                      idx[first(sel)] => copy => last(sel)
+                  elseif sel isa Pair{<:ColumnIndex, <:AbstractString}
+                      idx[first(sel)] => copy => Symbol(last(sel))
+                  else
+                      sel
+                  end for sel in args]
+    return select!(df, :, newargs..., renamecols=renamecols)
+end
 
 function transform!(@nospecialize(arg::Base.Callable), df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon
@@ -978,8 +989,20 @@ ERROR: ArgumentError: column :x in returned data frame is not equal to grouping
 
 See [`select`](@ref) for more examples.
 """
-transform(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true) =
-    select(df, :, args..., copycols=copycols, renamecols=renamecols)
+function transform(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true)
+    idx = index(df)
+    # when using the copy function the copy of source data frame
+    # is made exactly once even if copycols=true
+    # (copycols=true makes a copy only if the column was not copied previously)
+    newargs = Any[if sel isa Pair{<:ColumnIndex, Symbol}
+                      idx[first(sel)] => copy => last(sel)
+                  elseif sel isa Pair{<:ColumnIndex, <:AbstractString}
+                      idx[first(sel)] => copy => Symbol(last(sel))
+                  else
+                      sel
+                  end for sel in args]
+    return select(df, :, newargs..., copycols=copycols, renamecols=renamecols)
+end
 
 function transform(@nospecialize(arg::Base.Callable), df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon

diff --git a/src/other/precompile.jl b/src/other/precompile.jl
@@ -134,6 +134,9 @@ function precompile(all=false)
             end
         Base.precompile(Tuple{typeof(row_group_slots),Tuple{RepeatedVector{Union{Missing, Int}}, RepeatedVector{Union{Missing, Int}}, RepeatedVector{Union{Missing, String}}, Vector{Union{Missing, Int}}},Val{false},Vector{Int},Bool,Bool})
         Base.precompile(Tuple{typeof(combine),Union{Function, Type},GroupedDataFrame{DataFrame}})
+        Base.precompile(Tuple{typeof(select_transform!),Base.RefValue{Any},DataFrame,DataFrame,Set{Symbol},Bool,Base.RefValue{Bool}})
+        Base.precompile(Tuple{typeof(transform!),DataFrame,Any})
+        Base.precompile(Tuple{typeof(transform),DataFrame,Any})
     else
         Base.precompile(Tuple{typeof(_sortperm),DataFrame,Base.Sort.MergeSortAlg,DFPerm{Vector{Ordering}, Tuple{Vector{Int}, Vector{Int}, Vector{Int}}}})
         Base.precompile(Tuple{typeof(flatten),DataFrame,All{Tuple{}}})
@@ -1030,6 +1033,10 @@ function precompile(all=false)
             end
         Base.precompile(Tuple{typeof(row_group_slots),Tuple{Vector{Float64}},Val{false},Vector{Int},Bool,Bool})
         Base.precompile(Tuple{typeof(transform),DataFrame,Any})
+        Base.precompile(Tuple{typeof(transform!),DataFrame,Any})
+        Base.precompile(Tuple{typeof(select_transform!),Base.RefValue{Any},DataFrame,DataFrame,Set{Symbol},Bool,Base.RefValue{Bool}})
+        Base.precompile(Tuple{Core.kwftype(typeof(select)),NamedTuple{(:copycols, :renamecols), Tuple{Bool, Bool}},typeof(select),DataFrame,Any,Any})
+        Base.precompile(Tuple{Core.kwftype(typeof(select!)),NamedTuple{(:renamecols,), Tuple{Bool}},typeof(select!),DataFrame,Any,Any})
         Base.precompile(Tuple{typeof(push!),DataFrame,Dict{Symbol, Any}})
         Base.precompile(Tuple{typeof(_transformation_helper),DataFrame,Base.OneTo{Int},Base.RefValue{Any}})
         Base.precompile(Tuple{typeof(_semijoin_sorted),OnCol{Tuple{Vector{Union{Missing, Int}}, Vector{Union{Missing, Int}}, Vector{Union{Missing, Int}}}, 3},OnCol{Tuple{PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}, PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}, PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}}, 3},BitVector})

diff --git a/test/select.jl b/test/select.jl
@@ -1080,13 +1080,17 @@ end
 
     df2 = transform(df, [:x1, :x2] => +, :x2 => :x3, copycols=false)
     @test df2 == select(df, :, [:x1, :x2] => +, :x2 => :x3)
-    @test df.x2 === df2.x2 === df2.x3
+    @test df.x2 == df2.x2 == df2.x3
+    @test df.x2 === df2.x2
+    @test df.x2 !== df2.x3
     @test_throws ArgumentError transform(view(df, :, :), [:x1, :x2] => +, :x2 => :x3, copycols=false)
 
     x2 = df.x2
     transform!(df, [:x1, :x2] => +, :x2 => :x3)
     @test df == df2
-    @test x2 === df.x2 === df.x3
+    @test x2 == df.x2 == df.x3
+    @test x2 === df.x2
+    @test x2 !== df.x3
 
     @test transform(df) == df
     df2 = transform(df, copycols=false)
@@ -1620,4 +1624,28 @@ end
                                          [:a] => sum => ["new"],  false) == (1 => (sum => [:new]))
 end
 
+@testset "copying in transform when renaming" begin
+    for oldcol in (:a, "a", 1), newcol in (:b, "b")
+        df = DataFrame(a=1)
+        df2 = transform(df, oldcol => newcol)
+        @test df2.b == df2.a == df.a
+        @test df2.b !== df2.a
+        @test df2.b !== df.a
+        @test df2.a !== df.a
+
+        df2 = transform(df, oldcol => newcol, copycols=false)
+        @test df2.b == df2.a == df.a
+        @test df2.b !== df2.a
+        @test df2.b !== df.a
+        @test df2.a === df.a
+
+        a = df.a
+        transform!(df, oldcol => newcol)
+        @test df.b == df.a == a
+        @test df.b !== df.a
+        @test df.b !== a
+        @test df.a === a
+    end
+end
+
 end # module