From 3aaf1a8bdb9bfe611d9230f57067223605c60b53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 5 Feb 2023 09:15:10 +0100 Subject: [PATCH 1/4] Add `scalar` keyword argument to `flatten` --- NEWS.md | 2 + src/abstractdataframe/abstractdataframe.jl | 77 +++++++++++++++++----- test/reshape.jl | 65 ++++++++++++++++++ 3 files changed, 129 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1798a595df..86721af196 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,6 +26,8 @@ * Add `haskey` and `get` methods to `DataFrameColumns` to make it support dictionary interface more completely ([#3282](https://github.com/JuliaData/DataFrames.jl/pull/3282)) +* Allow passing `scalar` keyword argument in `flatten` + ([#3283](https://github.com/JuliaData/DataFrames.jl/pull/3283)) ## Bug fixes diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 3238c4c7ba..3c7a30ce4f 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2259,8 +2259,7 @@ function Missings.allowmissing(df::AbstractDataFrame, end """ - flatten(df::AbstractDataFrame, cols) - + flatten(df::AbstractDataFrame, cols; scalar::Type) When columns `cols` of data frame `df` have iterable elements that define `length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each element of each `col` in `cols` is flattened, meaning the column corresponding @@ -2273,6 +2272,11 @@ returned `DataFrame` will affect `df`. `cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). +If `scalar` is passed then values that have this type in flattened columns +are treated as scalars and broadcasted as many times as is needed to match +lengths of values stored in other columns. One row is produced if all +corresponding values are scalars. + $METADATA_FIXED # Examples @@ -2334,10 +2338,32 @@ julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]]) julia> flatten(df3, [:b, :c]) ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2 + +julia> df4 = DataFrame(a=[1, 2, 3], + b=[[1, 2], missing, missing], + c=[[5, 6], missing, [7, 8]]) +3×3 DataFrame + Row │ a b c + │ Int64 Array…? Array…? +─────┼───────────────────────── + 1 │ 1 [1, 2] [5, 6] + 2 │ 2 missing missing + 3 │ 3 missing [7, 8] +julia> flatten(df4, [:b, :c], scalar=Missing) +5×3 DataFrame + Row │ a b c + │ Int64 Int64? Int64? +─────┼───────────────────────── + 1 │ 1 1 5 + 2 │ 1 2 6 + 3 │ 2 missing missing + 4 │ 3 missing 7 + 5 │ 3 missing 8 ``` """ function flatten(df::AbstractDataFrame, - cols::Union{ColumnIndex, MultiColumnIndex}) + cols::Union{ColumnIndex, MultiColumnIndex}; + scalar::Type=Union{}) _check_consistency(df) idxcols = index(df)[cols] @@ -2348,15 +2374,16 @@ function flatten(df::AbstractDataFrame, end col1 = first(idxcols) - lengths = length.(df[!, col1]) - for col in idxcols - v = df[!, col] - if any(x -> length(x[1]) != x[2], zip(v, lengths)) - r = findfirst(x -> x != 0, length.(v) .- lengths) - colnames = _names(df) - throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " * - "and :$(colnames[col]) are not the same in row $r")) - end + lengths = Int[x isa scalar ? -1 : length(x) for x in df[!, col1]] + for (i, coli) in enumerate(idxcols) + i == 1 && continue + update_lengths!(lengths, df[!, coli], scalar, df, col1, coli) + end + + # handle case where in all columns we had a scalar + # in this case we keep it one time + for i in 1:length(lengths) + lengths[i] == -1 && (lengths[i] = 1) end new_df = similar(df[!, Not(cols)], sum(lengths)) @@ -2368,9 +2395,14 @@ function flatten(df::AbstractDataFrame, col_to_flatten = df[!, col] fast_path = eltype(col_to_flatten) isa AbstractVector && !isempty(col_to_flatten) - flattened_col = fast_path ? - reduce(vcat, col_to_flatten) : - collect(Iterators.flatten(col_to_flatten)) + flattened_col = if fast_path + reduce(vcat, col_to_flatten) + elseif scalar === Union{} + collect(Iterators.flatten(col_to_flatten)) + else + collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v + for (l, v) in zip(lengths, col_to_flatten))) + end insertcols!(new_df, col, _names(df)[col] => flattened_col) end @@ -2378,6 +2410,21 @@ function flatten(df::AbstractDataFrame, return new_df end +function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type, + df::AbstractDataFrame, col1, coli) + for (i, v) in enumerate(col) + v isa scalar && continue + lv = length(v) + if lengths[i] == -1 + lengths[i] = lv + elseif lengths[i] != lv + colnames = _names(df) + throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " * + "and :$(colnames[coli]) are not the same in row $i")) + end + end +end + function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector, lengths::AbstractVector{Int}) counter = 1 diff --git a/test/reshape.jl b/test/reshape.jl index 58cf7bfce0..8a3e75e750 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -431,6 +431,71 @@ end @test flatten(DataFrame(), All()) == DataFrame() end +@testset "flatten with scalar" begin + df = DataFrame(a=[1, 2, 3], + b=[[1, 2], missing, [3, 4]], + c=[[5, 6], missing, missing]) + @test flatten(df, :a) ≅ df + @test_throws MethodError flatten(df, :b) + @test flatten(df, :b, scalar=Missing) ≅ + DataFrame(a=[1, 1, 2, 3, 3], + b=[1, 2, missing, 3, 4], + c=[[5, 6], [5, 6], missing, missing, missing]) + @test flatten(df, [:b, :c], scalar=Missing) ≅ + DataFrame(a=[1, 1, 2, 3, 3], + b=[1, 2, missing, 3, 4], + c=[5, 6, missing, missing, missing]) + @test flatten(df, [:b, :c], scalar=Any) ≅ df + + df = DataFrame(a=missing, b=[1], c=missing, d=[[1, 2]]) + @test_throws ArgumentError flatten(df, All(), scalar=Missing) + @test flatten(df, Not(:d), scalar=Missing) ≅ + DataFrame(a=missing, b=1, c=missing, d=[[1, 2]]) + @test flatten(df, Not(:b), scalar=Missing) ≅ + DataFrame(a=[missing, missing], b=[1, 1], c=[missing, missing], d=[1, 2]) + + df = DataFrame(a="xy", b=[[1, 2]]) + @test flatten(df, [:a, :b]) == DataFrame(a=['x', 'y'], b=[1, 2]) + @test flatten(df, [:a, :b], scalar=String) == + DataFrame(a=["xy", "xy"], b=[1, 2]) + + df = DataFrame(a=[[1], [], [3, 4], missing], b = missings(4), id=1:4) + @test flatten(df, [:a, :b], scalar=Missing) ≅ + DataFrame(a=[1, 3, 4, missing], b=missings(4), id=[1, 3, 3, 4]) + df = DataFrame(id=1:10, x=[1:i-1 for i in 1:10]) + df.y = [iseven(last(v)) ? missing : v for v in df.x] + @test flatten(df, [:x, :y], scalar=Missing) ≅ + DataFrame(id=reduce(vcat, [fill(i, i-1) for i in 2:10]), + x=reduce(vcat, [1:i for i in 1:9]), + y=reduce(vcat, [iseven(i) ? missings(i) : (1:i) for i in 1:9])) + + # Below are tests showing handling of strings + df = DataFrame(id=1:5, + col1=["a", missing, 1:2, 3:4, 5:6], + col2=[11:12, 111:112, 1111:1112, missing, "b"]) + @test flatten(df, [:col1, :col2], scalar=Union{Missing, AbstractString}) ≅ + DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5], + col1=["a", "a", missing, missing, 1, 2, 3, 4, 5, 6], + col2=[11, 12, 111, 112, 1111, 1112, missing, missing, "b", "b"]) + @test_throws MethodError flatten(df, [:col1, :col2]) + @test_throws ArgumentError flatten(df, [:col1, :col2], scalar=Missing) + @test_throws MethodError flatten(df, [:col1, :col2], scalar=AbstractString) + + df = DataFrame(id=1:5, + col1=["ab", missing, 1:2, 3:4, 5:6], + col2=[11:12, 111:112, 1111:1112, missing, "cd"]) + @test flatten(df, [:col1, :col2], scalar=Union{Missing, AbstractString}) ≅ + DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5], + col1=["ab", "ab", missing, missing, 1, 2, 3, 4, 5, 6], + col2=[11, 12, 111, 112, 1111, 1112, missing, missing, "cd", "cd"]) + @test_throws MethodError flatten(df, [:col1, :col2]) + @test flatten(df, [:col1, :col2], scalar=Missing) ≅ + DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5], + col1=['a', 'b', missing, missing, 1, 2, 3, 4, 5, 6], + col2=[11, 12, 111, 112, 1111, 1112, missing, missing, 'c', 'd']) + @test_throws MethodError flatten(df, [:col1, :col2], scalar=AbstractString) +end + @testset "stack categorical test" begin Random.seed!(1234) d1 = DataFrame(a=repeat([1:3;], inner=[4]), From 24b54d93aaa0bf927ea5ad9f30c1c7c0bf7ed81b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 5 Feb 2023 13:14:15 +0100 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/abstractdataframe.jl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 3c7a30ce4f..51c61fd3c8 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2259,7 +2259,8 @@ function Missings.allowmissing(df::AbstractDataFrame, end """ - flatten(df::AbstractDataFrame, cols; scalar::Type) + flatten(df::AbstractDataFrame, cols; scalar::Type=Union{}) + When columns `cols` of data frame `df` have iterable elements that define `length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each element of each `col` in `cols` is flattened, meaning the column corresponding @@ -2274,8 +2275,8 @@ returned `DataFrame` will affect `df`. If `scalar` is passed then values that have this type in flattened columns are treated as scalars and broadcasted as many times as is needed to match -lengths of values stored in other columns. One row is produced if all -corresponding values are scalars. +lengths of values stored in other columns. If all values in a row are scalars, +a single row is produced. $METADATA_FIXED @@ -2349,6 +2350,7 @@ julia> df4 = DataFrame(a=[1, 2, 3], 1 │ 1 [1, 2] [5, 6] 2 │ 2 missing missing 3 │ 3 missing [7, 8] + julia> flatten(df4, [:b, :c], scalar=Missing) 5×3 DataFrame Row │ a b c @@ -2411,7 +2413,7 @@ function flatten(df::AbstractDataFrame, end function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type, - df::AbstractDataFrame, col1, coli) + df::AbstractDataFrame, col1::Integer, coli::Integer) for (i, v) in enumerate(col) v isa scalar && continue lv = length(v) From 8b7e50699921e0b744c161e0d476e73184758025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 5 Feb 2023 13:53:02 +0100 Subject: [PATCH 3/4] fix incorrect condition in flatten --- src/abstractdataframe/abstractdataframe.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 51c61fd3c8..4a631aea2e 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2395,13 +2395,16 @@ function flatten(df::AbstractDataFrame, length(idxcols) > 1 && sort!(idxcols) for col in idxcols col_to_flatten = df[!, col] - fast_path = eltype(col_to_flatten) isa AbstractVector && + fast_path = eltype(col_to_flatten) <: AbstractVector && !isempty(col_to_flatten) flattened_col = if fast_path + @info "1" reduce(vcat, col_to_flatten) elseif scalar === Union{} + @info "2" collect(Iterators.flatten(col_to_flatten)) else + @info "3" collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v for (l, v) in zip(lengths, col_to_flatten))) end From ac5a33e093e77074c5a16d50ca7583db24805ecb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 5 Feb 2023 17:29:50 +0100 Subject: [PATCH 4/4] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/abstractdataframe.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 4a631aea2e..fe6fb842f5 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2398,13 +2398,10 @@ function flatten(df::AbstractDataFrame, fast_path = eltype(col_to_flatten) <: AbstractVector && !isempty(col_to_flatten) flattened_col = if fast_path - @info "1" reduce(vcat, col_to_flatten) elseif scalar === Union{} - @info "2" collect(Iterators.flatten(col_to_flatten)) else - @info "3" collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v for (l, v) in zip(lengths, col_to_flatten))) end