From fb246d18bc2d484d7a8c99c862493722f952e78f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 16 Feb 2022 10:16:24 +0100 Subject: [PATCH 1/6] make permutedims more flexible --- src/abstractdataframe/reshape.jl | 41 +++++++++++++++++++++++++------- test/reshape.jl | 27 +++++++++++++++++++-- 2 files changed, 57 insertions(+), 11 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 6f187b9905..a4bbcc4310 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -572,7 +572,7 @@ Base.transpose(::AbstractDataFrame, args...; kwargs...) = """ permutedims(df::AbstractDataFrame, src_namescol::Union{Int, Symbol, AbstractString}, [dest_namescol::Union{Symbol, AbstractString}]; - makeunique::Bool=false) + makeunique::Bool=false, strict::Bool=true) Turn `df` on its side such that rows become columns and values in the column indexed by `src_namescol` become the names of new columns. @@ -582,12 +582,16 @@ with name specified by `dest_namescol`. # Arguments - `df` : the `AbstractDataFrame` - `src_namescol` : the column that will become the new header. - This column's element type must be `AbstractString` or `Symbol`. - `dest_namescol` : the name of the first column in the returned `DataFrame`. Defaults to the same name as `src_namescol`. - `makeunique` : if `false` (the default), an error will be raised if duplicate names are found; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +- `strict` : if `true` (the default), an error will be raised if the values + contained in the `src_namescol` are not all `Symbol` or all `AbstractString`, + or can all be converted to `AbstractString` using `convert`. If `false` + then any values are accepted and the will be changed to strings using + the `string` function. Note: The element types of columns in resulting `DataFrame` (other than the first column, which always has element type `String`) @@ -637,34 +641,53 @@ julia> permutedims(df2, 1, "different_name") """ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, dest_namescol::Union{Symbol, AbstractString}; - makeunique::Bool=false) + makeunique::Bool=false, strict::Bool=true) if src_namescol isa Integer 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) end - eltype(df[!, src_namescol]) <: SymbolOrString || - throw(ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`")) + src_col_names = df[!, src_namescol] + local new_col_names + if eltype(src_col_names) <: SymbolOrString + new_col_names = src_col_names + elseif all(x -> x isa Symbol, src_col_names) + new_col_names = collect(Symbol, src_col_names) + elseif !strict + new_col_names = string.(src_col_names) + else + try + new_col_names = collect(AbstractString, src_col_names) + catch e + if e isa MethodError && e.f === convert + throw(ArgumentError("all elements of src_namescol must support " * + "conversion to AbstractString")) + else + rethrow(e) + end + end + end df_notsrc = df[!, Not(src_namescol)] df_permuted = DataFrame(dest_namescol => names(df_notsrc)) if ncol(df_notsrc) == 0 - df_tmp = DataFrame(AbstractVector[[] for _ in 1:nrow(df)], df[!, src_namescol], + df_tmp = DataFrame(AbstractVector[[] for _ in 1:nrow(df)], new_col_names, makeunique=makeunique, copycols=false) else m = permutedims(Matrix(df_notsrc)) - df_tmp = rename!(DataFrame(Tables.table(m)), df[!, src_namescol], makeunique=makeunique) + df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique) end return hcat!(df_permuted, df_tmp, makeunique=makeunique, copycols=false) end function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; - makeunique::Bool=false) + makeunique::Bool=false, strict::Bool=true) if src_namescol isa Integer 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) dest_namescol = _names(df)[src_namescol] else dest_namescol = src_namescol end - return permutedims(df, src_namescol, dest_namescol; makeunique=makeunique) + return permutedims(df, src_namescol, dest_namescol; + makeunique=makeunique, strict=strict) end diff --git a/test/reshape.jl b/test/reshape.jl index 288835219e..56e54f2030 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -637,14 +637,37 @@ end @test permutedims(df4[!, [:e]], 1) == DataFrame(e=String[], x=[], y=[]) # Can't index float Column @test_throws ArgumentError permutedims(df4[!, [:a, :b, :c]], 1) - @test_throws ArgumentError permutedims(DataFrame(a=Float64[], b=Float64[]), 1) + # but can if it is empty + @test permutedims(DataFrame(a=Float64[], b=Float64[]), 1) == DataFrame(a="b") # Can't index columns that allow for missing @test_throws ArgumentError permutedims(df4[!, [:g, :a, :b, :c]], 1) - @test_throws ArgumentError permutedims(df4[!, [:h, :a, :b]], 1) + # but can if they do not contain missing + @test permutedims(df4[!, [:h, :a, :b]], 1) == permutedims(df4[!, [:e, :a, :b]], 1, :h) # Can't permute empty `df` ... @test_throws BoundsError permutedims(DataFrame(), 1) # ... but can permute zero-row df @test permutedims(DataFrame(a=String[], b=Float64[]), 1) == DataFrame(a=["b"]) + + # tests of strict handling + df = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true, false]) + ref = permutedims(df, 1) + # allowed as contents is strings + df.a = collect(Any, df.a) + @test permutedims(df, 1) == ref + # this is allowed as conversion from categorical to string is allowed + df.a = categorical(df.a) + @test permutedims(df, 1) == ref + # allowed as contents is symbols + df.a = Any[:x, :y] + @test permutedims(df, 1) == ref + # not allowed mixing of strings and symbols + df.a = Any[:x, "y"] + @test_throws ArgumentError permutedims(df, 1) + # not allowed values that cannot be converted to string + df.a = Any['x', 'y'] + @test_throws ArgumentError permutedims(df, 1) + # but allowed with strict=false + @test permutedims(df, 1, strict=false) == ref end @testset "stack view=true additional tests" begin From 6206e7eb1b8628f3ff28d4923c8a250fdd7e47a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 16 Feb 2022 10:19:41 +0100 Subject: [PATCH 2/6] update NEWS.md --- NEWS.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/NEWS.md b/NEWS.md index a9e6199b0e..d88932575b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,12 @@ +# DataFrames.jl v1.4 Release Notes + +## New functionalities + +* `permutedims` now supports a `strict` keyword argument that allows + for a more flexible handling of values stored in a column that will + become a new header + ([#3004](https://github.com/JuliaData/DataFrames.jl/issues/3004)) + # DataFrames.jl v1.3.2 Patch Release Notes ## Bug fixes From cbf5b2e708338b6fe448d5e5d5d9b3c23edaffa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 17 Feb 2022 09:26:10 +0100 Subject: [PATCH 3/6] Update src/abstractdataframe/reshape.jl --- src/abstractdataframe/reshape.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index a4bbcc4310..3f2c80a669 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -656,7 +656,7 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, new_col_names = string.(src_col_names) else try - new_col_names = collect(AbstractString, src_col_names) + new_col_names = collect(String, src_col_names) catch e if e isa MethodError && e.f === convert throw(ArgumentError("all elements of src_namescol must support " * From 7b36bda2cbc4dcdda8d78c2af1cad603905f6cce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 17 Feb 2022 09:29:32 +0100 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/reshape.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 3f2c80a669..75d168bd9c 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -589,9 +589,9 @@ with name specified by `dest_namescol`. with `_i` (`i` starting at 1 for the first duplicate). - `strict` : if `true` (the default), an error will be raised if the values contained in the `src_namescol` are not all `Symbol` or all `AbstractString`, - or can all be converted to `AbstractString` using `convert`. If `false` + or can all be converted to `String` using `convert`. If `false` then any values are accepted and the will be changed to strings using - the `string` function. + the [`string`](@ref) function. Note: The element types of columns in resulting `DataFrame` (other than the first column, which always has element type `String`) From 4586f90f81e631f27e5b81de0516d4891745ea22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 17 Feb 2022 09:29:40 +0100 Subject: [PATCH 5/6] Update src/abstractdataframe/reshape.jl --- src/abstractdataframe/reshape.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 75d168bd9c..4cf0aaa5cc 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -660,7 +660,7 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, catch e if e isa MethodError && e.f === convert throw(ArgumentError("all elements of src_namescol must support " * - "conversion to AbstractString")) + "conversion to String")) else rethrow(e) end From 14150be239fdb3b5d7bf2fc9c55c87ef220baf1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 17 Feb 2022 12:20:18 +0100 Subject: [PATCH 6/6] Update src/abstractdataframe/reshape.jl --- src/abstractdataframe/reshape.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 4cf0aaa5cc..174680d8bf 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -591,7 +591,7 @@ with name specified by `dest_namescol`. contained in the `src_namescol` are not all `Symbol` or all `AbstractString`, or can all be converted to `String` using `convert`. If `false` then any values are accepted and the will be changed to strings using - the [`string`](@ref) function. + the `string` function. Note: The element types of columns in resulting `DataFrame` (other than the first column, which always has element type `String`)