diff --git a/NEWS.md b/NEWS.md index a9e6199b0e..407ba1603a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# DataFrames.jl v1.4 Release Notes + +## New functionalities + +* `unstack` now allows passing a function in `valuestransform` keyword argument; + this allows for a convenient creation of two dimensional pivot tables + ([#2998](https://github.com/JuliaData/DataFrames.jl/issues/2998)) + # DataFrames.jl v1.3.2 Patch Release Notes ## Bug fixes diff --git a/Project.toml b/Project.toml index 9f3b998906..642fae2b13 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "DataFrames" uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "1.3.2" +version = "1.4.0" [deps] Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index 38068c89e0..7fb41c493b 100755 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -296,9 +296,8 @@ This is provides a view of the original columns stacked together. Id columns -- `RepeatedVector` This repeats the original columns N times where N is the number of columns stacked. -None of these reshaping functions perform any aggregation. To do aggregation, -use the split-apply-combine functions in combination with reshaping. Here is an -example: +To do aggregation, use the split-apply-combine functions in combination with +`unstack` or use the `valuestransform` keyword argument in `unstack`. 
Here is an example: ```jldoctest reshape julia> using Statistics @@ -326,9 +325,9 @@ julia> d = stack(iris, Not(:Species)) 750 │ Iris-virginica id 150.0 735 rows omitted -julia> x = combine(groupby(d, [:variable, :Species]), :value => mean => :vsum) +julia> agg = combine(groupby(d, [:variable, :Species]), :value => mean => :vmean) 15×3 DataFrame - Row │ variable Species vsum + Row │ variable Species vmean │ String String15 Float64 ─────┼─────────────────────────────────────── 1 │ SepalLength Iris-setosa 5.006 @@ -347,7 +346,18 @@ julia> x = combine(groupby(d, [:variable, :Species]), :value => mean => :vsum) 14 │ id Iris-versicolor 75.5 15 │ id Iris-virginica 125.5 -julia> first(unstack(x, :Species, :vsum), 6) +julia> unstack(agg, :variable, :Species, :vmean) +5×4 DataFrame + Row │ variable Iris-setosa Iris-versicolor Iris-virginica + │ String Float64? Float64? Float64? +─────┼─────────────────────────────────────────────────────────── + 1 │ SepalLength 5.006 5.936 6.588 + 2 │ SepalWidth 3.418 2.77 2.974 + 3 │ PetalLength 1.464 4.26 5.552 + 4 │ PetalWidth 0.244 1.326 2.026 + 5 │ id 25.5 75.5 125.5 + +julia> unstack(d, :variable, :Species, :value, valuestransform=mean) 5×4 DataFrame Row │ variable Iris-setosa Iris-versicolor Iris-virginica │ String Float64? Float64? Float64? 
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 6f187b9905..b1158f8932 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -197,12 +197,18 @@ function _stackview(df::AbstractDataFrame, measure_vars::AbstractVector{Int}, end """ - unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity, - allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing) - unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity, - allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing) - unstack(df::AbstractDataFrame; renamecols::Function=identity, - allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing) + unstack(df::AbstractDataFrame, rowkeys, colkey, values; + renamecols::Function=identity, allowmissing::Bool=false, + allowduplicates::Bool=false, valuestransform=nothing, + fill=missing) + unstack(df::AbstractDataFrame, colkey, values; + renamecols::Function=identity, allowmissing::Bool=false, + allowduplicates::Bool=false, valuestransform=nothing, + fill=missing) + unstack(df::AbstractDataFrame; + renamecols::Function=identity, allowmissing::Bool=false, + allowduplicates::Bool=false, valuestransform=nothing, + fill=missing) Unstack data frame `df`, i.e. convert it from long to wide format. @@ -210,29 +216,33 @@ Row and column keys will be ordered in the order of their first appearance. # Positional arguments - `df` : the AbstractDataFrame to be unstacked -- `rowkeys` : the columns with a unique key for each row, if not given, - find a key by grouping on anything not a `colkey` or `value`. - Can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). - If `rowkeys` contains no columns all rows are assumed to have the same key. 
-- `colkey` : the column ($COLUMNINDEX_STR) holding the column names in wide format, - defaults to `:variable` -- `value` : the value column ($COLUMNINDEX_STR), defaults to `:value` +- `rowkeys` : the columns with a unique key for each row, if not given, find a + key by grouping on anything not a `colkey` or `values`. Can be any column + selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). If `rowkeys` contains no + columns all rows are assumed to have the same key. +- `colkey` : the column ($COLUMNINDEX_STR) holding the column names in wide + format, defaults to `:variable` +- `values` : the column storing values ($COLUMNINDEX_STR), defaults to `:value` # Keyword arguments -- `renamecols`: a function called on each unique value in `colkey`; it must return - the name of the column to be created (typically as a string or a `Symbol`). - Duplicates in resulting names when converted to `Symbol` are not allowed. - By default no transformation is performed. -- `allowmissing`: if `false` (the default) then an error will be thrown if `colkey` - contains `missing` values; if `true` then a column referring to `missing` value - will be created. -- `allowduplicates`: if `false` (the default) then an error an error will be thrown - if combination of `rowkeys` and `colkey` contains duplicate entries; if `true` - then then the last encountered `value` will be retained. -- `fill`: missing row/column combinations are filled with this value. The default - is `missing`. If the `value` column is a `CategoricalVector` and `fill` - is not `missing` then in order to keep unstacked value columns also +- `renamecols`: a function called on each unique value in `colkey`; it must + return the name of the column to be created (typically as a string or a + `Symbol`). Duplicates in resulting names when converted to `Symbol` are not + allowed. By default no transformation is performed. 
+- `allowmissing`: if `false` (the default) then an error will be thrown if + `colkey` contains `missing` values; if `true` then a column referring to + `missing` value will be created. +- `allowduplicates`: if `false` (the default) then an error will be + thrown if combination of `rowkeys` and `colkey` contains duplicate entries; if + `true` then the last encountered `value` will be retained; + this keyword argument is ignored if `valuestransform` keyword argument is passed. +- `valuestransform`: if passed then `allowduplicates` is ignored and instead + the passed function will be called on a vector view containing all elements + for each combination of `rowkeys` and `colkey` present in the data. +- `fill`: missing row/column combinations are filled with this value. The + default is `missing`. If the `values` column is a `CategoricalVector` and + `fill` is not `missing` then in order to keep unstacked value columns also `CategoricalVector` the `fill` must be passed as `CategoricalValue` # Examples @@ -336,7 +346,10 @@ julia> unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x)) 4 │ 4 2.0 1.0 2.0 5 │ 5 2.0 1.0 3.0 6 │ 6 2.0 1.0 3.0 +``` +Note that there are some differences between the widened results above. +```jldoctest julia> df = DataFrame(id=["1", "1", "2"], variable=["Var1", "Var2", "Var1"], value=[1, 2, 3]) @@ -355,35 +368,87 @@ julia> unstack(df, :variable, :value, fill=0) ─────┼────────────────────── 1 │ 1 1 2 2 │ 2 3 0 + +julia> df = DataFrame(cols=["a", "a", "b"], values=[1, 2, 4]) +3×2 DataFrame + Row │ cols values + │ String Int64 +─────┼──────────────── + 1 │ a 1 + 2 │ a 2 + 3 │ b 4 + +julia> unstack(df, :cols, :values, valuestransform=copy) +1×2 DataFrame + Row │ a b + │ Array…? Array…? +─────┼────────────────── + 1 │ [1, 2] [4] + +julia> unstack(df, :cols, :values, valuestransform=sum) +1×2 DataFrame + Row │ a b + │ Int64? Int64? 
+─────┼──────────────── + 1 │ 3 4 ``` -Note that there are some differences between the widened results above. """ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, - value::ColumnIndex; renamecols::Function=identity, - allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing) - rowkey_ints = vcat(index(df)[rowkeys]) + values::ColumnIndex; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false, + valuestransform=nothing, fill=missing) + if !isnothing(valuestransform) + groupcols = vcat(index(df)[rowkeys], index(df)[colkey]) + @assert groupcols isa AbstractVector{Int} + gdf = groupby(df, groupcols) + if check_aggregate(valuestransform, df[!, values]) isa AbstractAggregate + # if valuestransform function is AbstractAggregate + # then we are sure it will return a scalar number so we can + # leave it as is and be sure we use fast path in combine + agg_fun = valuestransform + else + # in general valuestransform function could return e.g. 
a vector, + # which would get expanded to multiple rows so we protect it with + # Ref that will get unwrapped by combine + agg_fun = Ref∘valuestransform + end + df_op = combine(gdf, values => agg_fun, renamecols=false) + group_rows = find_group_row(gdf) + if !issorted(group_rows) + df_op = df_op[sortperm(group_rows), :] + end + # set allowduplicates to true as we should not have any duplicates now + # and allowduplicates=true is a bit faster + allowduplicates = true + else + df_op = df + end + # use df_op below to make sure it is type stable + rowkey_ints = vcat(index(df_op)[rowkeys]) @assert rowkey_ints isa AbstractVector{Int} - g_rowkey = groupby(df, rowkey_ints) - g_colkey = groupby(df, colkey) - valuecol = df[!, value] - return _unstack(df, rowkey_ints, index(df)[colkey], g_colkey, + g_rowkey = groupby(df_op, rowkey_ints) + g_colkey = groupby(df_op, colkey) + valuecol = df_op[!, values] + return _unstack(df_op, rowkey_ints, index(df_op)[colkey], g_colkey, valuecol, g_rowkey, renamecols, allowmissing, allowduplicates, fill) end -function unstack(df::AbstractDataFrame, colkey::ColumnIndex, value::ColumnIndex; +function unstack(df::AbstractDataFrame, colkey::ColumnIndex, values::ColumnIndex; renamecols::Function=identity, - allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing) + allowmissing::Bool=false, allowduplicates::Bool=false, + valuestransform=nothing, fill=missing) colkey_int = index(df)[colkey] - value_int = index(df)[value] + value_int = index(df)[values] return unstack(df, Not(colkey_int, value_int), colkey_int, value_int, renamecols=renamecols, allowmissing=allowmissing, - allowduplicates=allowduplicates, fill=fill) + allowduplicates=allowduplicates, valuestransform=valuestransform, fill=fill) end unstack(df::AbstractDataFrame; renamecols::Function=identity, - allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing) = + allowmissing::Bool=false, allowduplicates::Bool=false, + valuestransform=nothing, fill=missing) = 
unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing, - allowduplicates=allowduplicates, fill=fill) + allowduplicates=allowduplicates, valuestransform=valuestransform, fill=fill) # we take into account the fact that idx, starts and ends are computed lazily # so we rather directly reference the gdf.groups @@ -410,8 +475,8 @@ end function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, colkey::Int, g_colkey::GroupedDataFrame, valuecol::AbstractVector, g_rowkey::GroupedDataFrame, - renamecols::Function, - allowmissing::Bool, allowduplicates::Bool, fill) + renamecols::Function, allowmissing::Bool, + allowduplicates::Bool, fill) rowref = g_rowkey.groups row_group_row_idxs = find_group_row(g_rowkey) Nrow = length(g_rowkey) @@ -425,22 +490,31 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, throw(ArgumentError("Missing value in variable :$(_names(df)[colkey]). " * "Pass `allowmissing=true` to skip missings.")) end - unstacked_val = [fill!(similar(valuecol, - promote_type(eltype(valuecol), typeof(fill)), - Nrow), - fill) for _ in 1:Ncol] - - mask_filled = falses(Nrow, Ncol) - @assert length(rowref) == length(colref) == length(valuecol) - for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol)) - if !allowduplicates && mask_filled[row_id, col_id] - throw(ArgumentError("Duplicate entries in unstack at row $k for key "* - "$(tuple((df[k, s] for s in rowkeys)...)) and variable $(colref_map[col_id]). 
" * - "Pass allowduplicates=true to allow them.")) + + unstacked_val = [fill!(similar(valuecol, + promote_type(eltype(valuecol), typeof(fill)), + Nrow), + fill) for _ in 1:Ncol] + + # use a separate path for allowduplicates to reduce memory use and increase speed + if allowduplicates + for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol)) + unstacked_val[col_id][row_id] = val + end + else + mask_filled = falses(Nrow, Ncol) + for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol)) + if mask_filled[row_id, col_id] + bad_key = tuple((df[k, s] for s in rowkeys)...) + bad_var = colref_map[col_id] + throw(ArgumentError("Duplicate entries in unstack at row $k for key "* + "$bad_key and variable $bad_var. " * + "Pass allowduplicates=true to allow them.")) + end + unstacked_val[col_id][row_id] = val + mask_filled[row_id, col_id] = true end - unstacked_val[col_id][row_id] = val - mask_filled[row_id, col_id] = true end # note that Symbol(renamecols(x)) must produce unique column names @@ -458,7 +532,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, res_df = hcat(df1, df2, copycols=false) @assert length(row_group_row_idxs) == nrow(res_df) - # avoid reordering when col_group_row_idxs was already ordered + # avoid reordering when row_group_row_idxs was already ordered if !issorted(row_group_row_idxs) res_df = res_df[sortperm(row_group_row_idxs), :] end diff --git a/test/reshape.jl b/test/reshape.jl index 288835219e..2cae3029a5 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -737,6 +737,89 @@ end df = DataFrame(x=[:one, :two, :one], y=[1, 2, 3]) @test_throws ArgumentError unstack(df, :x, :y) @test unstack(df, :x, :y, allowduplicates=true) == DataFrame(one=3, two=2) + @test unstack(df, :x, :y, valuestransform=identity) == + DataFrame(one=[[1, 3]], two=[[2]]) + @test unstack(df, :x, :y, valuestransform=last) == + DataFrame(one=3, two=2) + @test unstack(df, :x, :y, valuestransform=first) == + DataFrame(one=1, 
two=2) + @test unstack(df, :x, :y, valuestransform=length) == + DataFrame(one=2, two=1) +end + +@testset "valuestransform" begin + df = DataFrame(rowid=[1, 1, 1, 1, 2, 2], colid=[1, 1, 2, 2, 3, 3], values=1:6) + @test_throws ArgumentError unstack(df, :rowid, :colid, :values) + @test unstack(df, :rowid, :colid, :values, allowduplicates=true) ≅ + DataFrame("rowid" => 1:2, "1" => [2, missing], + "2" => [4, missing], "3" => [missing, 6]) + @test unstack(df, :rowid, :colid, :values, allowduplicates=true, fill=0) == + DataFrame("rowid" => 1:2, "1" => [2, 0], + "2" => [4, 0], "3" => [0, 6]) + @test unstack(df, :rowid, :colid, :values, valuestransform=identity) ≅ + DataFrame("rowid" => 1:2, "1" => [1:2, missing], + "2" => [3:4, missing], "3" => [missing, 5:6]) + @test unstack(df, :rowid, :colid, :values, + valuestransform=identity, fill=Int[]) == + DataFrame("rowid" => 1:2, "1" => [1:2, []], + "2" => [3:4, []], "3" => [[], 5:6]) + @test unstack(df, :rowid, :colid, :values, valuestransform=sum) ≅ + DataFrame("rowid" => 1:2, "1" => [3, missing], + "2" => [7, missing], "3" => [missing, 11]) + @test unstack(df, :rowid, :colid, :values, valuestransform=sum, fill=0) == + DataFrame("rowid" => 1:2, "1" => [3, 0], + "2" => [7, 0], "3" => [0, 11]) + @test unstack(df, :rowid, :colid, :values, valuestransform=sum, fill="X") == + DataFrame("rowid" => 1:2, "1" => [3, "X"], + "2" => [7, "X"], "3" => ["X", 11]) + @test unstack(df, :rowid, :colid, :values, valuestransform=length) ≅ + DataFrame("rowid" => 1:2, "1" => [2, missing], + "2" => [2, missing], "3" => [missing, 2]) + @test unstack(df, :rowid, :colid, :values, valuestransform=length, fill=0) == + DataFrame("rowid" => 1:2, "1" => [2, 0], + "2" => [2, 0], "3" => [0, 2]) + @test unstack(df, :rowid, :colid, :values, + valuestransform=x -> isempty(x) ? 
missing : length(x)) ≅ + DataFrame("rowid" => 1:2, "1" => [2, missing], + "2" => [2, missing], "3" => [missing, 2]) + @test unstack(df, :rowid, :colid, :values, + valuestransform=x -> isempty(x) ? missing : x) ≅ + DataFrame("rowid" => 1:2, "1" => [1:2, missing], + "2" => [3:4, missing], "3" => [missing, 5:6]) + + df = DataFrame(rowid=[2, 2, 2, 2, 1, 1], colid=[2, 2, 1, 1, 3, 3], values=1:6) + @test unstack(df, :rowid, :colid, :values, valuestransform=identity) ≅ + DataFrame("rowid" => [2,1], "2" => [1:2, missing], + "1" => [3:4, missing], "3" => [missing, 5:6]) + @test unstack(df, :rowid, :colid, :values, valuestransform=identity, fill="X") == + DataFrame("rowid" => [2,1], "2" => [1:2, "X"], + "1" => [3:4, "X"], "3" => ["X", 5:6]) + + Random.seed!(1234) + # check correctness of row and column ordering + for _ in 1:10 + df = DataFrame(rowid=rand(1:10, 50), colid=rand(1:10, 50), values=1:50) + res = unstack(df, :rowid, :colid, :values, valuestransform=last) + @test res ≅ unstack(df, :rowid, :colid, :values, allowduplicates=true) + @test res.rowid == unique(df.rowid) + @test names(res, Not(1)) == string.(unique(df.colid)) + res = unstack(df, :rowid, :colid, :values, valuestransform=last, fill=0) + @test res ≅ unstack(df, :rowid, :colid, :values, allowduplicates=true, fill=0) + @test res.rowid == unique(df.rowid) + @test names(res, Not(1)) == string.(unique(df.colid)) + + df.rowid=categorical(df.rowid, levels=shuffle(unique(df.rowid))) + df.colid=categorical(df.colid, levels=shuffle(unique(df.colid))) + res = unstack(df, :rowid, :colid, :values, valuestransform=last) + @test res ≅ unstack(df, :rowid, :colid, :values, allowduplicates=true) + @test unwrap.(res.rowid) == unique(df.rowid) + @test names(res, Not(1)) == string.(unique(df.colid)) + res = unstack(df, :rowid, :colid, :values, valuestransform=last, fill=0) + @test res ≅ + unstack(df, :rowid, :colid, :values, allowduplicates=true, fill=0) + @test unwrap.(res.rowid) == unique(df.rowid) + @test names(res, Not(1)) 
== string.(unique(df.colid)) + end end end # module