Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make permutedims more flexible #3004

Merged
merged 7 commits into from
Feb 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## New functionalities

* `permutedims` now supports a `strict` keyword argument that allows
for more flexible handling of values stored in a column that will
become a new header
([#3004](https://github.com/JuliaData/DataFrames.jl/issues/3004))
* `unstack` now allows passing a function in the `valuestransform` keyword argument;
this allows for convenient creation of two-dimensional pivot tables
([#2998](https://github.com/JuliaData/DataFrames.jl/issues/2998))
Expand Down
41 changes: 32 additions & 9 deletions src/abstractdataframe/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -646,7 +646,7 @@ Base.transpose(::AbstractDataFrame, args...; kwargs...) =
"""
permutedims(df::AbstractDataFrame, src_namescol::Union{Int, Symbol, AbstractString},
[dest_namescol::Union{Symbol, AbstractString}];
makeunique::Bool=false)
makeunique::Bool=false, strict::Bool=true)

Turn `df` on its side such that rows become columns
and values in the column indexed by `src_namescol` become the names of new columns.
Expand All @@ -656,12 +656,16 @@ with name specified by `dest_namescol`.
# Arguments
- `df` : the `AbstractDataFrame`
- `src_namescol` : the column that will become the new header.
This column's element type must be `AbstractString` or `Symbol`.
- `dest_namescol` : the name of the first column in the returned `DataFrame`.
Defaults to the same name as `src_namescol`.
- `makeunique` : if `false` (the default), an error will be raised
if duplicate names are found; if `true`, duplicate names will be suffixed
with `_i` (`i` starting at 1 for the first duplicate).
- `strict` : if `true` (the default), an error will be raised if the values
contained in the `src_namescol` are not all `Symbol` or all `AbstractString`,
and cannot all be converted to `String` using `convert`. If `false`
then any values are accepted and they will be changed to strings using
the `string` function.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
the `string` function.
the [`string`](@ref) function.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nalimilan would such reference to Base Julia work?

CC @mortenpi

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(I have accidentally committed this suggestion, but the question remains)

Thank you!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK - no need to answer - CI gave me the answer. I revert the change.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes sorry that doesn't work indeed. So my net contribution to this PR is clearly negative...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

your 1 PR approval is worth 1000 LOC of contribution 😄


Note: The element types of columns in resulting `DataFrame`
(other than the first column, which always has element type `String`)
Expand Down Expand Up @@ -711,34 +715,53 @@ julia> permutedims(df2, 1, "different_name")
"""
function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex,
dest_namescol::Union{Symbol, AbstractString};
makeunique::Bool=false)
makeunique::Bool=false, strict::Bool=true)

if src_namescol isa Integer
1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol))
end
eltype(df[!, src_namescol]) <: SymbolOrString ||
throw(ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`"))
src_col_names = df[!, src_namescol]
local new_col_names
if eltype(src_col_names) <: SymbolOrString
new_col_names = src_col_names
elseif all(x -> x isa Symbol, src_col_names)
new_col_names = collect(Symbol, src_col_names)
elseif !strict
new_col_names = string.(src_col_names)
else
try
new_col_names = collect(String, src_col_names)
catch e
if e isa MethodError && e.f === convert
throw(ArgumentError("all elements of src_namescol must support " *
"conversion to String"))
else
rethrow(e)
end
end
end

df_notsrc = df[!, Not(src_namescol)]
df_permuted = DataFrame(dest_namescol => names(df_notsrc))

if ncol(df_notsrc) == 0
df_tmp = DataFrame(AbstractVector[[] for _ in 1:nrow(df)], df[!, src_namescol],
df_tmp = DataFrame(AbstractVector[[] for _ in 1:nrow(df)], new_col_names,
makeunique=makeunique, copycols=false)
else
m = permutedims(Matrix(df_notsrc))
df_tmp = rename!(DataFrame(Tables.table(m)), df[!, src_namescol], makeunique=makeunique)
df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique)
end
return hcat!(df_permuted, df_tmp, makeunique=makeunique, copycols=false)
end

function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex;
makeunique::Bool=false)
makeunique::Bool=false, strict::Bool=true)
if src_namescol isa Integer
1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol))
dest_namescol = _names(df)[src_namescol]
else
dest_namescol = src_namescol
end
return permutedims(df, src_namescol, dest_namescol; makeunique=makeunique)
return permutedims(df, src_namescol, dest_namescol;
makeunique=makeunique, strict=strict)
end
27 changes: 25 additions & 2 deletions test/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -637,14 +637,37 @@ end
@test permutedims(df4[!, [:e]], 1) == DataFrame(e=String[], x=[], y=[])
# Can't index float Column
@test_throws ArgumentError permutedims(df4[!, [:a, :b, :c]], 1)
@test_throws ArgumentError permutedims(DataFrame(a=Float64[], b=Float64[]), 1)
# but can if it is empty
@test permutedims(DataFrame(a=Float64[], b=Float64[]), 1) == DataFrame(a="b")
# Can't index columns that allow for missing
@test_throws ArgumentError permutedims(df4[!, [:g, :a, :b, :c]], 1)
@test_throws ArgumentError permutedims(df4[!, [:h, :a, :b]], 1)
# but can if they do not contain missing
@test permutedims(df4[!, [:h, :a, :b]], 1) == permutedims(df4[!, [:e, :a, :b]], 1, :h)
# Can't permute empty `df` ...
@test_throws BoundsError permutedims(DataFrame(), 1)
# ... but can permute zero-row df
@test permutedims(DataFrame(a=String[], b=Float64[]), 1) == DataFrame(a=["b"])

# tests of strict handling
df = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true, false])
ref = permutedims(df, 1)
# allowed as the contents are all strings
df.a = collect(Any, df.a)
@test permutedims(df, 1) == ref
# this is allowed as conversion from categorical to string is allowed
df.a = categorical(df.a)
@test permutedims(df, 1) == ref
# allowed as the contents are all symbols
df.a = Any[:x, :y]
@test permutedims(df, 1) == ref
# mixing of strings and symbols is not allowed
df.a = Any[:x, "y"]
@test_throws ArgumentError permutedims(df, 1)
# values that cannot be converted to string are not allowed
df.a = Any['x', 'y']
@test_throws ArgumentError permutedims(df, 1)
# but allowed with strict=false
@test permutedims(df, 1, strict=false) == ref
end

@testset "stack view=true additional tests" begin
Expand Down