Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow function in allowduplicates in unstack #2998

Merged
merged 11 commits into from
Feb 17, 2022
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# DataFrames.jl v1.4 Release Notes
bkamins marked this conversation as resolved.
Show resolved Hide resolved

## New functionalities

* `unstack` now allows passing a function in the `valuestransform` keyword argument;
this allows for convenient creation of two-dimensional pivot tables
([#2998](https://github.com/JuliaData/DataFrames.jl/issues/2998))

# DataFrames.jl v1.3.2 Patch Release Notes

## Bug fixes
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "DataFrames"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "1.3.2"
version = "1.4.0"

[deps]
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
Expand Down
22 changes: 16 additions & 6 deletions docs/src/man/reshaping_and_pivoting.md
Original file line number Diff line number Diff line change
Expand Up @@ -296,9 +296,8 @@ This provides a view of the original columns stacked together.
Id columns -- `RepeatedVector`
This repeats the original columns N times where N is the number of columns stacked.

None of these reshaping functions perform any aggregation. To do aggregation,
use the split-apply-combine functions in combination with reshaping. Here is an
example:
To do aggregation, use the split-apply-combine functions in combination with
`unstack` or use the `valuestransform` keyword argument in `unstack`. Here is an example:

```jldoctest reshape
julia> using Statistics
Expand Down Expand Up @@ -326,9 +325,9 @@ julia> d = stack(iris, Not(:Species))
750 │ Iris-virginica id 150.0
735 rows omitted

julia> x = combine(groupby(d, [:variable, :Species]), :value => mean => :vsum)
julia> agg = combine(groupby(d, [:variable, :Species]), :value => mean => :vmean)
15×3 DataFrame
Row │ variable Species vsum
Row │ variable Species vmean
│ String String15 Float64
─────┼───────────────────────────────────────
1 │ SepalLength Iris-setosa 5.006
Expand All @@ -347,7 +346,18 @@ julia> x = combine(groupby(d, [:variable, :Species]), :value => mean => :vsum)
14 │ id Iris-versicolor 75.5
15 │ id Iris-virginica 125.5

julia> first(unstack(x, :Species, :vsum), 6)
julia> unstack(agg, :variable, :Species, :vmean)
5×4 DataFrame
Row │ variable Iris-setosa Iris-versicolor Iris-virginica
│ String Float64? Float64? Float64?
─────┼───────────────────────────────────────────────────────────
1 │ SepalLength 5.006 5.936 6.588
2 │ SepalWidth 3.418 2.77 2.974
3 │ PetalLength 1.464 4.26 5.552
4 │ PetalWidth 0.244 1.326 2.026
5 │ id 25.5 75.5 125.5

julia> unstack(d, :variable, :Species, :value, valuestransform=mean)
5×4 DataFrame
Row │ variable Iris-setosa Iris-versicolor Iris-virginica
│ String Float64? Float64? Float64?
Expand Down
188 changes: 131 additions & 57 deletions src/abstractdataframe/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -197,42 +197,52 @@ function _stackview(df::AbstractDataFrame, measure_vars::AbstractVector{Int},
end

"""
unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
unstack(df::AbstractDataFrame; renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
unstack(df::AbstractDataFrame, rowkeys, colkey, value;
renamecols::Function=identity, allowmissing::Bool=false,
allowduplicates::Bool=false, valuestransform=nothing,
fill=missing)
unstack(df::AbstractDataFrame, colkey, value;
renamecols::Function=identity, allowmissing::Bool=false,
allowduplicates::Bool=false, valuestransform=nothing,
fill=missing)
unstack(df::AbstractDataFrame;
renamecols::Function=identity, allowmissing::Bool=false,
allowduplicates::Bool=false, valuestransform=nothing,
fill=missing)

Unstack data frame `df`, i.e. convert it from long to wide format.

Row and column keys will be ordered in the order of their first appearance.

# Positional arguments
- `df` : the AbstractDataFrame to be unstacked
- `rowkeys` : the columns with a unique key for each row, if not given,
find a key by grouping on anything not a `colkey` or `value`.
Can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
If `rowkeys` contains no columns all rows are assumed to have the same key.
- `colkey` : the column ($COLUMNINDEX_STR) holding the column names in wide format,
defaults to `:variable`
- `value` : the value column ($COLUMNINDEX_STR), defaults to `:value`
- `rowkeys` : the columns with a unique key for each row, if not given, find a
key by grouping on anything not a `colkey` or `value`. Can be any column
selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). If `rowkeys` contains no
columns all rows are assumed to have the same key.
- `colkey` : the column ($COLUMNINDEX_STR) holding the column names in wide
format, defaults to `:variable`
- `value` : the column storing values ($COLUMNINDEX_STR), defaults to `:value`

# Keyword arguments

- `renamecols`: a function called on each unique value in `colkey`; it must return
the name of the column to be created (typically as a string or a `Symbol`).
Duplicates in resulting names when converted to `Symbol` are not allowed.
By default no transformation is performed.
- `allowmissing`: if `false` (the default) then an error will be thrown if `colkey`
contains `missing` values; if `true` then a column referring to `missing` value
will be created.
- `allowduplicates`: if `false` (the default) then an error will be thrown
if a combination of `rowkeys` and `colkey` contains duplicate entries; if `true`
then the last encountered `value` will be retained.
- `fill`: missing row/column combinations are filled with this value. The default
is `missing`. If the `value` column is a `CategoricalVector` and `fill`
is not `missing` then in order to keep unstacked value columns also
- `renamecols`: a function called on each unique value in `colkey`; it must
return the name of the column to be created (typically as a string or a
`Symbol`). Duplicates in resulting names when converted to `Symbol` are not
allowed. By default no transformation is performed.
- `allowmissing`: if `false` (the default) then an error will be thrown if
`colkey` contains `missing` values; if `true` then a column referring to
`missing` value will be created.
- `allowduplicates`: if `false` (the default) then an error will be
thrown if a combination of `rowkeys` and `colkey` contains duplicate entries; if
`true` then the last encountered `value` will be retained;
this keyword argument is ignored if the `valuestransform` keyword argument is passed.
- `valuestransform`: if passed (the default is `nothing`) then `allowduplicates`
is ignored and instead the passed function is called on a vector view containing
all values for each combination of `rowkeys` and `colkey` present in `df`; its
result is stored in the corresponding cell of the output.
bkamins marked this conversation as resolved.
Show resolved Hide resolved
- `fill`: missing row/column combinations are filled with this value. The
default is `missing`. If the `value` column is a `CategoricalVector` and
`fill` is not `missing` then in order to keep unstacked value columns also
`CategoricalVector` the `fill` must be passed as `CategoricalValue`

# Examples
Expand Down Expand Up @@ -336,7 +346,10 @@ julia> unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x))
4 │ 4 2.0 1.0 2.0
5 │ 5 2.0 1.0 3.0
6 │ 6 2.0 1.0 3.0
```
Note that there are some differences between the widened results above.

```jldoctest
julia> df = DataFrame(id=["1", "1", "2"],
variable=["Var1", "Var2", "Var1"],
value=[1, 2, 3])
Expand All @@ -355,35 +368,87 @@ julia> unstack(df, :variable, :value, fill=0)
─────┼──────────────────────
1 │ 1 1 2
2 │ 2 3 0

julia> df = DataFrame(cols=["a", "a", "b"], values=[1, 2, 4])
3×2 DataFrame
Row │ cols values
│ String Int64
─────┼────────────────
1 │ a 1
2 │ a 2
3 │ b 4

julia> unstack(df, :cols, :values, valuestransform=copy)
1×2 DataFrame
Row │ a b
│ Array…? Array…?
─────┼──────────────────
1 │ [1, 2] [4]

julia> unstack(df, :cols, :values, valuestransform=sum)
1×2 DataFrame
Row │ a b
│ Int64? Int64?
─────┼────────────────
1 │ 3 4
```
Note that there are some differences between the widened results above.
"""
function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex,
value::ColumnIndex; renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
rowkey_ints = vcat(index(df)[rowkeys])
values::ColumnIndex; renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false,
valuestransform=nothing, fill=missing)
if !isnothing(valuestransform)
groupcols = vcat(index(df)[rowkeys], index(df)[colkey])
@assert groupcols isa AbstractVector{Int}
gdf = groupby(df, groupcols)
if check_aggregate(valuestransform, df[!, values]) isa AbstractAggregate
# if valuestransform function is AbstractAggregate
# then we are sure it will return a scalar number so we can
# leave it as is and be sure we use fast path in combine
agg_fun = valuestransform
else
# in general valuestransform function could return e.g. a vector,
# which would get expanded to multiple rows so we protect it with
# Ref that will get unwrapped by combine
agg_fun = Ref∘valuestransform
end
df_op = combine(gdf, values => agg_fun, renamecols=false)
group_rows = find_group_row(gdf)
if !issorted(group_rows)
df_op = df_op[sortperm(group_rows), :]
end
# set allowduplicates to true as we should not have any duplicates now
# and allowduplicates=true is a bit faster
allowduplicates = true
else
df_op = df
end
bkamins marked this conversation as resolved.
Show resolved Hide resolved
# use df_op below to make sure it is type stable
rowkey_ints = vcat(index(df_op)[rowkeys])
@assert rowkey_ints isa AbstractVector{Int}
g_rowkey = groupby(df, rowkey_ints)
g_colkey = groupby(df, colkey)
valuecol = df[!, value]
return _unstack(df, rowkey_ints, index(df)[colkey], g_colkey,
g_rowkey = groupby(df_op, rowkey_ints)
g_colkey = groupby(df_op, colkey)
valuecol = df_op[!, values]
return _unstack(df_op, rowkey_ints, index(df_op)[colkey], g_colkey,
valuecol, g_rowkey, renamecols, allowmissing, allowduplicates, fill)
end

function unstack(df::AbstractDataFrame, colkey::ColumnIndex, value::ColumnIndex;
function unstack(df::AbstractDataFrame, colkey::ColumnIndex, values::ColumnIndex;
renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
allowmissing::Bool=false, allowduplicates::Bool=false,
valuestransform=nothing, fill=missing)
colkey_int = index(df)[colkey]
value_int = index(df)[value]
value_int = index(df)[values]
return unstack(df, Not(colkey_int, value_int), colkey_int, value_int,
renamecols=renamecols, allowmissing=allowmissing,
allowduplicates=allowduplicates, fill=fill)
allowduplicates=allowduplicates, valuestransform=valuestransform, fill=fill)
end

unstack(df::AbstractDataFrame; renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing) =
allowmissing::Bool=false, allowduplicates::Bool=false,
valuestransform=nothing, fill=missing) =
unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing,
allowduplicates=allowduplicates, fill=fill)
allowduplicates=allowduplicates, valuestransform=valuestransform, fill=fill)

# we take into account the fact that idx, starts and ends are computed lazily
# so we rather directly reference the gdf.groups
Expand All @@ -410,8 +475,8 @@ end
function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
colkey::Int, g_colkey::GroupedDataFrame,
valuecol::AbstractVector, g_rowkey::GroupedDataFrame,
renamecols::Function,
allowmissing::Bool, allowduplicates::Bool, fill)
renamecols::Function, allowmissing::Bool,
allowduplicates::Bool, fill)
rowref = g_rowkey.groups
row_group_row_idxs = find_group_row(g_rowkey)
Nrow = length(g_rowkey)
Expand All @@ -425,22 +490,31 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
throw(ArgumentError("Missing value in variable :$(_names(df)[colkey]). " *
"Pass `allowmissing=true` to skip missings."))
end
unstacked_val = [fill!(similar(valuecol,
promote_type(eltype(valuecol), typeof(fill)),
Nrow),
fill) for _ in 1:Ncol]

mask_filled = falses(Nrow, Ncol)

@assert length(rowref) == length(colref) == length(valuecol)
for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol))
if !allowduplicates && mask_filled[row_id, col_id]
throw(ArgumentError("Duplicate entries in unstack at row $k for key "*
"$(tuple((df[k, s] for s in rowkeys)...)) and variable $(colref_map[col_id]). " *
"Pass allowduplicates=true to allow them."))

unstacked_val = [fill!(similar(valuecol,
promote_type(eltype(valuecol), typeof(fill)),
Nrow),
fill) for _ in 1:Ncol]

# use a separate path for allowduplicates to reduce memory use and increase speed
if allowduplicates
for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol))
unstacked_val[col_id][row_id] = val
end
else
mask_filled = falses(Nrow, Ncol)
for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol))
if mask_filled[row_id, col_id]
bad_key = tuple((df[k, s] for s in rowkeys)...)
bad_var = colref_map[col_id]
throw(ArgumentError("Duplicate entries in unstack at row $k for key "*
"$bad_key and variable $bad_var. " *
"Pass allowduplicates=true to allow them."))
end
unstacked_val[col_id][row_id] = val
mask_filled[row_id, col_id] = true
end
unstacked_val[col_id][row_id] = val
mask_filled[row_id, col_id] = true
end

# note that Symbol(renamecols(x)) must produce unique column names
Expand All @@ -458,7 +532,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
res_df = hcat(df1, df2, copycols=false)

@assert length(row_group_row_idxs) == nrow(res_df)
# avoid reordering when col_group_row_idxs was already ordered
# avoid reordering when row_group_row_idxs was already ordered
if !issorted(row_group_row_idxs)
res_df = res_df[sortperm(row_group_row_idxs), :]
end
Expand Down
Loading