Skip to content

Commit

Permalink
add cols to mapcols and mapcols! (#3386)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Oct 17, 2023
1 parent 87c2162 commit 1a5da8a
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 17 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
column names only to a subset of the columns specified by the `cols`
keyword argument
([#3380](https://github.com/JuliaData/DataFrames.jl/pull/3380))
* `mapcols` and `mapcols!` now allow applying the column-transforming
function only to the subset of columns selected by the `cols`
keyword argument
([#3386](https://github.com/JuliaData/DataFrames.jl/pull/3386))

## Bug fixes

Expand Down
78 changes: 61 additions & 17 deletions src/abstractdataframe/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -107,20 +107,20 @@ as a `DataFrameRows` over a view of rows of parent of `dfr`.
julia> collect(Iterators.partition(eachrow(DataFrame(x=1:5)), 2))
3-element Vector{DataFrames.DataFrameRows{SubDataFrame{DataFrame, DataFrames.Index, UnitRange{Int64}}}}:
2×1 DataFrameRows
Row │ x
│ Int64
Row │ x
│ Int64
─────┼───────
1 │ 1
2 │ 2
2×1 DataFrameRows
Row │ x
│ Int64
Row │ x
│ Int64
─────┼───────
1 │ 3
2 │ 4
1×1 DataFrameRows
Row │ x
│ Int64
Row │ x
│ Int64
─────┼───────
1 │ 5
```
Expand Down Expand Up @@ -408,12 +408,17 @@ Base.show(dfcs::DataFrameColumns;
summary=summary, eltypes=eltypes, truncate=truncate, kwargs...)

"""
mapcols(f::Union{Function, Type}, df::AbstractDataFrame)
mapcols(f::Union{Function, Type}, df::AbstractDataFrame; cols=All())
Return a `DataFrame` where each column of `df` selected by `cols` (by default, all columns)
is transformed using function `f`.
Columns not selected by `cols` are copied.
Return a `DataFrame` where each column of `df` is transformed using function `f`.
`f` must return `AbstractVector` objects all with the same length or scalars
(all values other than `AbstractVector` are considered to be a scalar).
The `cols` column selector can be any value accepted as a column selector by the `names` function.
Note that `mapcols` guarantees not to reuse the columns from `df` in the returned
`DataFrame`. If `f` returns its argument then it gets copied before being stored.
Expand All @@ -440,15 +445,32 @@ julia> mapcols(x -> x.^2, df)
2 │ 4 144
3 │ 9 169
4 │ 16 196
julia> mapcols(x -> x.^2, df, cols=r"y")
4×2 DataFrame
Row │ x y
│ Int64 Int64
─────┼──────────────
1 │ 1 121
2 │ 2 144
3 │ 3 169
4 │ 4 196
```
"""
function mapcols(f::Union{Function, Type}, df::AbstractDataFrame)
function mapcols(f::Union{Function, Type}, df::AbstractDataFrame; cols=All())
if cols === All() || cols === Colon()
apply = Iterators.repeated(true)
else
picked = Set(names(df, cols))
apply = Bool[name in picked for name in names(df)]
end

# note: `f` must return a consistent length
vs = AbstractVector[]
seenscalar = false
seenvector = false
for v in eachcol(df)
fv = f(v)
for (v, doapply) in zip(eachcol(df), apply)
fv = doapply ? f(v) : copy(v)
if fv isa AbstractVector
if seenscalar
throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
Expand All @@ -470,9 +492,12 @@ function mapcols(f::Union{Function, Type}, df::AbstractDataFrame)
end

"""
mapcols!(f::Union{Function, Type}, df::DataFrame)
mapcols!(f::Union{Function, Type}, df::DataFrame; cols=All())
Update a `DataFrame` in-place where each column of `df` selected by `cols` (by default, all columns)
is transformed using function `f`.
Columns not selected by `cols` are left unchanged.
Update a `DataFrame` in-place where each column of `df` is transformed using function `f`.
`f` must return `AbstractVector` objects all with the same length or scalars
(all values other than `AbstractVector` are considered to be a scalar).
Expand Down Expand Up @@ -503,20 +528,39 @@ julia> df
2 │ 4 144
3 │ 9 169
4 │ 16 196
julia> mapcols!(x -> 2 * x, df, cols=r"x");
julia> df
4×2 DataFrame
Row │ x y
│ Int64 Int64
─────┼──────────────
1 │ 2 121
2 │ 8 144
3 │ 18 169
4 │ 32 196
```
"""
function mapcols!(f::Union{Function, Type}, df::DataFrame)
# note: `f` must return a consistent length
function mapcols!(f::Union{Function,Type}, df::DataFrame; cols=All())
if ncol(df) == 0 # skip if no columns
_drop_all_nonnote_metadata!(df)
return df
end

if cols === All() || cols === Colon()
apply = Iterators.repeated(true)
else
picked = Set(names(df, cols))
apply = Bool[name in picked for name in names(df)]
end

# note: `f` must return a consistent length
vs = AbstractVector[]
seenscalar = false
seenvector = false
for v in eachcol(df)
fv = f(v)
for (v, doapply) in zip(eachcol(df), apply)
fv = doapply ? f(v) : v
if fv isa AbstractVector
if seenscalar
throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
Expand Down
25 changes: 25 additions & 0 deletions test/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,19 @@ end
df = mapcols(x -> 2:2, df)
@test df == DataFrame(a=2)
@test df.a isa Vector{Int}

df = DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4])
@test mapcols(x -> 2x, df, cols=r"a") == DataFrame(a1=[2, 4], a2=[4, 6], b=[3, 4])
@test mapcols(x -> 2x, df, cols="b") == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8])
@test mapcols(x -> 2x, df, cols=Not(r"a")) == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8])
@test mapcols(x -> 2x, df, cols=Int) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8])
@test mapcols(x -> 2x, df, cols=Not(All())) == DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4])
@test mapcols(x -> 2x, df, cols=:) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8])

df2 = mapcols(x -> 2x, df, cols="b")
@test df2.a1 == df.a1 && df2.a1 !== df.a1
@test df2.a2 == df.a2 && df2.a2 !== df.a2
@test df2.b == 2*df.b
end

@testset "mapcols!" begin
Expand Down Expand Up @@ -109,6 +122,18 @@ end
mapcols!(x -> 2:2, df)
@test df == DataFrame(a=2)
@test df.a isa Vector{Int}

df = DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4])
@test mapcols!(x -> 2x, copy(df), cols=r"a") == DataFrame(a1=[2, 4], a2=[4, 6], b=[3, 4])
@test mapcols!(x -> 2x, copy(df), cols="b") == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8])
@test mapcols!(x -> 2x, copy(df), cols=Not(r"a")) == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8])
@test mapcols!(x -> 2x, copy(df), cols=Int) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8])
@test mapcols!(x -> 2x, copy(df), cols=Not(All())) == DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4])
@test mapcols!(x -> 2x, copy(df), cols=:) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8])
a1, a2, b = eachcol(df)
mapcols!(x -> 2x, df, cols=Not(All()))
@test df == DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4])
@test df.a1 === a1 && df.a2 === a2 && df.b === b
end

@testset "SubDataFrame" begin
Expand Down

0 comments on commit 1a5da8a

Please sign in to comment.