diff --git a/NEWS.md b/NEWS.md index c0f0a3ce74..68612ec5ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,9 @@ choose the fast path only when it is safe; this resolves inconsistencies with what the same functions not using fast path produce ([#2357](https://github.com/JuliaData/DataFrames.jl/pull/2357)) +* in `describe` the specification of custom aggregation is now `function => name`; + old `name => function` order is now deprecated + ([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401)) ## New functionalities diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 51a69236b1..b1964f3d1a 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -475,7 +475,7 @@ where each row represents a variable and each column a summary statistic. `:nmissing`. The default statistics used are `:mean`, `:min`, `:median`, `:max`, `:nmissing`, and `:eltype`. - `:all` as the only `Symbol` argument to return all statistics. - - A `name => function` pair where `name` is a `Symbol` or string. This will + - A `function => name` pair where `name` is a `Symbol` or string. This will create a column of summary statistics with the provided name. - `cols` : a keyword argument allowing to select only a subset of columns from `df` to describe. Can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). @@ -521,7 +521,7 @@ julia> describe(df, :min, :max) │ 2 │ x │ 0.1 │ 1.0 │ │ 3 │ y │ 'a' │ 'j' │ -julia> describe(df, :min, :sum => sum) +julia> describe(df, :min, sum => :sum) 3×3 DataFrame │ Row │ variable │ min │ sum │ │ │ Symbol │ Any │ Any │ @@ -530,7 +530,7 @@ julia> describe(df, :min, :sum => sum) │ 2 │ x │ 0.1 │ 5.5 │ │ 3 │ y │ 'a' │ │ -julia> describe(df, :min, :sum => sum, cols=:x) +julia> describe(df, :min, sum => :sum, cols=:x) 1×3 DataFrame │ Row │ variable │ min │ sum │ │ │ Symbol │ Float64 │ Float64 │ @@ -538,11 +538,16 @@ julia> describe(df, :min, :sum => sum, cols=:x) │ 1 │ x │ 0.1 │ 5.5 │ ``` """ -DataAPI.describe(df::AbstractDataFrame, - stats::Union{Symbol, Pair{<:SymbolOrString}}...; - cols=:) = - _describe(select(df, cols, copycols=false), collect(stats)) - +function DataAPI.describe(df::AbstractDataFrame, stats::Union{Symbol, + Pair{<:Base.Callable,<:SymbolOrString}, + Pair{<:SymbolOrString}}...; # TODO: remove after deprecation + cols=:) + if any(x -> x isa Pair{<:SymbolOrString}, stats) + Base.depwarn("name => function order is deprecated; use function => name instead", :describe) + end + return _describe(select(df, cols, copycols=false), + Any[s isa Pair{<:SymbolOrString} ? last(s) => first(s) : s for s in stats]) +end DataAPI.describe(df::AbstractDataFrame; cols=:) = _describe(select(df, cols, copycols=false), [:mean, :min, :median, :max, :nmissing, :eltype]) @@ -565,9 +570,9 @@ function _describe(df::AbstractDataFrame, stats::AbstractVector) throw(ArgumentError(":$not_allowed not allowed." * allowed_msg)) end - custom_funs = Pair[Symbol(s[1]) => s[2] for s in stats if s isa Pair] + custom_funs = Pair[s[1] => Symbol(s[2]) for s in stats if s isa Pair] - ordered_names = [s isa Symbol ? s : Symbol(first(s)) for s in stats] + ordered_names = [s isa Symbol ? s : Symbol(last(s)) for s in stats] if !allunique(ordered_names) df_ord_names = DataFrame(ordered_names = ordered_names) @@ -662,7 +667,7 @@ end function get_stats!(d::Dict, col::AbstractVector, stats::AbstractVector{<:Pair}) for stat in stats - d[stat[1]] = try stat[2](col) catch end + d[stat[2]] = try stat[1](col) catch end end end diff --git a/test/dataframe.jl b/test/dataframe.jl index 9e9adebf8e..cb78cac45c 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -670,9 +670,9 @@ end describe_output.test_std = describe_output.std # Test that describe works with a Pair and a symbol @test describe_output[:, [:variable, :mean, :test_std]] ≅ - describe(df, :mean, :test_std => std) + describe(df, :mean, std => :test_std) @test describe_output[:, [:variable, :mean, :test_std]] ≅ - describe(df, :mean, "test_std" => std) + describe(df, :mean, std => "test_std") # Test that describe works with a dataframe with no observations df = DataFrame(a = Int[], b = String[], c = []) @@ -683,6 +683,9 @@ end @test describe(df, cols=Not(1)) ≅ describe(select(df, Not(1))) @test describe(df, cols=Not("a")) ≅ describe(select(df, Not(1))) + @test describe(DataFrame(a=[1,2]), cols = :a, :min, minimum => :min2, maximum => "max2", :max) == + DataFrame(variable=:a, min=1, min2=1, max2=2, max=2) + @test_throws ArgumentError describe(df, :mean, :all) end diff --git a/test/deprecated.jl b/test/deprecated.jl index 189df3d9f8..20b4600dfb 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -226,4 +226,9 @@ end categorical!(df, Between(1,2)) end +@testset "deprecated describe syntax" begin + @test describe(DataFrame(a=[1,2]), cols = :a, :min, :min2 => minimum, "max2" => maximum, :max) == + DataFrame(variable=:a, min=1, min2=1, max2=2, max=2) +end + end # module