From 3aaf1a8bdb9bfe611d9230f57067223605c60b53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 5 Feb 2023 09:15:10 +0100
Subject: [PATCH 1/4] Add `scalar` keyword argument to `flatten`

---
 NEWS.md                                    |  2 +
 src/abstractdataframe/abstractdataframe.jl | 77 +++++++++++++++++-----
 test/reshape.jl                            | 65 ++++++++++++++++++
 3 files changed, 129 insertions(+), 15 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 1798a595df..86721af196 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -26,6 +26,8 @@
 * Add `haskey` and `get` methods to `DataFrameColumns`
   to make it support dictionary interface more completely
   ([#3282](https://github.com/JuliaData/DataFrames.jl/pull/3282))
+* Allow passing the `scalar` keyword argument to `flatten`
+  ([#3283](https://github.com/JuliaData/DataFrames.jl/pull/3283))
 
 ## Bug fixes
 
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 3238c4c7ba..3c7a30ce4f 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -2259,8 +2259,7 @@ function Missings.allowmissing(df::AbstractDataFrame,
 end
 
 """
-    flatten(df::AbstractDataFrame, cols)
-
+    flatten(df::AbstractDataFrame, cols; scalar::Type)
 When columns `cols` of data frame `df` have iterable elements that define
 `length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
 element of each `col` in `cols` is flattened, meaning the column corresponding
@@ -2273,6 +2272,11 @@ returned `DataFrame` will affect `df`.
 
 `cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
 
+If `scalar` is passed then values that have this type in flattened columns
+are treated as scalars and broadcasted as many times as is needed to match
+lengths of values stored in other columns. One row is produced if all
+corresponding values are scalars.
+
 $METADATA_FIXED
 
 # Examples
@@ -2334,10 +2338,32 @@ julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
 
 julia> flatten(df3, [:b, :c])
 ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
+
+julia> df4 = DataFrame(a=[1, 2, 3],
+                       b=[[1, 2], missing, missing],
+                       c=[[5, 6], missing, [7, 8]])
+3×3 DataFrame
+ Row │ a      b        c
+     │ Int64  Array…?  Array…?
+─────┼─────────────────────────
+   1 │     1  [1, 2]   [5, 6]
+   2 │     2  missing  missing
+   3 │     3  missing  [7, 8]
+julia> flatten(df4, [:b, :c], scalar=Missing)
+5×3 DataFrame
+ Row │ a      b        c
+     │ Int64  Int64?   Int64?
+─────┼─────────────────────────
+   1 │     1        1        5
+   2 │     1        2        6
+   3 │     2  missing  missing
+   4 │     3  missing        7
+   5 │     3  missing        8
 ```
 """
 function flatten(df::AbstractDataFrame,
-                 cols::Union{ColumnIndex, MultiColumnIndex})
+                 cols::Union{ColumnIndex, MultiColumnIndex};
+                 scalar::Type=Union{})
     _check_consistency(df)
 
     idxcols = index(df)[cols]
@@ -2348,15 +2374,16 @@ function flatten(df::AbstractDataFrame,
     end
 
     col1 = first(idxcols)
-    lengths = length.(df[!, col1])
-    for col in idxcols
-        v = df[!, col]
-        if any(x -> length(x[1]) != x[2], zip(v, lengths))
-            r = findfirst(x -> x != 0, length.(v) .- lengths)
-            colnames = _names(df)
-            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
-                                "and :$(colnames[col]) are not the same in row $r"))
-        end
+    lengths = Int[x isa scalar ? -1 : length(x) for x in df[!, col1]]
+    for (i, coli) in enumerate(idxcols)
+        i == 1 && continue
+        update_lengths!(lengths, df[!, coli], scalar, df, col1, coli)
+    end
+
+    # rows in which every flattened column held a scalar still have length -1;
+    # each such row is kept exactly once in the output
+    for i in 1:length(lengths)
+        lengths[i] == -1 && (lengths[i] = 1)
     end
 
     new_df = similar(df[!, Not(cols)], sum(lengths))
@@ -2368,9 +2395,14 @@ function flatten(df::AbstractDataFrame,
         col_to_flatten = df[!, col]
         fast_path = eltype(col_to_flatten) isa AbstractVector &&
                     !isempty(col_to_flatten)
-        flattened_col = fast_path ?
-            reduce(vcat, col_to_flatten) :
-            collect(Iterators.flatten(col_to_flatten))
+        flattened_col = if fast_path
+                reduce(vcat, col_to_flatten)
+            elseif scalar === Union{}
+                collect(Iterators.flatten(col_to_flatten))
+            else
+                collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v
+                                          for (l, v) in zip(lengths, col_to_flatten)))
+            end
         insertcols!(new_df, col, _names(df)[col] => flattened_col)
     end
 
@@ -2378,6 +2410,21 @@ function flatten(df::AbstractDataFrame,
     return new_df
 end
 
+function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type,
+                         df::AbstractDataFrame, col1, coli)
+    for (i, v) in enumerate(col)
+        v isa scalar && continue
+        lv = length(v)
+        if lengths[i] == -1  # -1: only scalars seen in row i so far, so adopt this length
+            lengths[i] = lv
+        elseif lengths[i] != lv  # conflicts with the length already recorded for row i
+            colnames = _names(df)  # column names are only needed for the error message
+            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
+                                "and :$(colnames[coli]) are not the same in row $i"))
+        end
+    end
+end
+
 function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector,
                          lengths::AbstractVector{Int})
     counter = 1
diff --git a/test/reshape.jl b/test/reshape.jl
index 58cf7bfce0..8a3e75e750 100644
--- a/test/reshape.jl
+++ b/test/reshape.jl
@@ -431,6 +431,71 @@ end
     @test flatten(DataFrame(), All()) == DataFrame()
 end
 
+@testset "flatten with scalar" begin
+    df = DataFrame(a=[1, 2, 3],
+                   b=[[1, 2], missing, [3, 4]],
+                   c=[[5, 6], missing, missing])
+    @test flatten(df, :a) ≅ df  # each number has length 1, so no row is expanded
+    @test_throws MethodError flatten(df, :b)  # `missing` is not a scalar by default
+    @test flatten(df, :b, scalar=Missing) ≅  # :c is not flattened, only repeated
+          DataFrame(a=[1, 1, 2, 3, 3],
+                    b=[1, 2, missing, 3, 4],
+                    c=[[5, 6], [5, 6], missing, missing, missing])
+    @test flatten(df, [:b, :c], scalar=Missing) ≅  # row 3: :c is repeated to match :b
+          DataFrame(a=[1, 1, 2, 3, 3],
+                    b=[1, 2, missing, 3, 4],
+                    c=[5, 6, missing, missing, missing])
+    @test flatten(df, [:b, :c], scalar=Any) ≅ df  # all values scalar: one row per row
+
+    df = DataFrame(a=missing, b=[1], c=missing, d=[[1, 2]])
+    @test_throws ArgumentError flatten(df, All(), scalar=Missing)  # length 1 in :b vs 2 in :d
+    @test flatten(df, Not(:d), scalar=Missing) ≅
+        DataFrame(a=missing, b=1, c=missing, d=[[1, 2]])
+    @test flatten(df, Not(:b), scalar=Missing) ≅
+        DataFrame(a=[missing, missing], b=[1, 1], c=[missing, missing], d=[1, 2])
+
+    df = DataFrame(a="xy", b=[[1, 2]])
+    @test flatten(df, [:a, :b]) == DataFrame(a=['x', 'y'], b=[1, 2])  # string split into chars
+    @test flatten(df, [:a, :b], scalar=String) ==
+          DataFrame(a=["xy", "xy"], b=[1, 2])
+
+    df = DataFrame(a=[[1], [], [3, 4], missing], b = missings(4), id=1:4)
+    @test flatten(df, [:a, :b], scalar=Missing) ≅  # the empty vector in row 2 yields no rows
+          DataFrame(a=[1, 3, 4, missing], b=missings(4), id=[1, 3, 3, 4])
+    df = DataFrame(id=1:10, x=[1:i-1 for i in 1:10])
+    df.y = [iseven(last(v)) ? missing : v for v in df.x]  # ranges ending in an even number => missing
+    @test flatten(df, [:x, :y], scalar=Missing) ≅
+          DataFrame(id=reduce(vcat, [fill(i, i-1) for i in 2:10]),
+                    x=reduce(vcat, [1:i for i in 1:9]),
+                    y=reduce(vcat, [iseven(i) ? missings(i) : (1:i) for i in 1:9]))
+
+    # Below are tests showing handling of strings
+    df = DataFrame(id=1:5,
+                   col1=["a", missing, 1:2, 3:4, 5:6],
+                   col2=[11:12, 111:112, 1111:1112, missing, "b"])
+    @test flatten(df, [:col1, :col2], scalar=Union{Missing, AbstractString}) ≅
+          DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5],
+                    col1=["a", "a", missing, missing, 1, 2, 3, 4, 5, 6],
+                    col2=[11, 12, 111, 112, 1111, 1112, missing, missing, "b", "b"])
+    @test_throws MethodError flatten(df, [:col1, :col2])  # `missing` is not a scalar here
+    @test_throws ArgumentError flatten(df, [:col1, :col2], scalar=Missing)  # "a": 1 vs 11:12: 2
+    @test_throws MethodError flatten(df, [:col1, :col2], scalar=AbstractString)  # `missing` again
+
+    df = DataFrame(id=1:5,
+                   col1=["ab", missing, 1:2, 3:4, 5:6],
+                   col2=[11:12, 111:112, 1111:1112, missing, "cd"])
+    @test flatten(df, [:col1, :col2], scalar=Union{Missing, AbstractString}) ≅
+          DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5],
+                    col1=["ab", "ab", missing, missing, 1, 2, 3, 4, 5, 6],
+                    col2=[11, 12, 111, 112, 1111, 1112, missing, missing, "cd", "cd"])
+    @test_throws MethodError flatten(df, [:col1, :col2])  # `missing` is not a scalar here
+    @test flatten(df, [:col1, :col2], scalar=Missing) ≅  # two-char strings match length 2
+          DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5],
+                    col1=['a', 'b', missing, missing, 1, 2, 3, 4, 5, 6],
+                    col2=[11, 12, 111, 112, 1111, 1112, missing, missing, 'c', 'd'])
+    @test_throws MethodError flatten(df, [:col1, :col2], scalar=AbstractString)  # `missing` again
+end
+
 @testset "stack categorical test" begin
     Random.seed!(1234)
     d1 = DataFrame(a=repeat([1:3;], inner=[4]),

From 24b54d93aaa0bf927ea5ad9f30c1c7c0bf7ed81b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 5 Feb 2023 13:14:15 +0100
Subject: [PATCH 2/4] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/abstractdataframe.jl | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 3c7a30ce4f..51c61fd3c8 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -2259,7 +2259,8 @@ function Missings.allowmissing(df::AbstractDataFrame,
 end
 
 """
-    flatten(df::AbstractDataFrame, cols; scalar::Type)
+    flatten(df::AbstractDataFrame, cols; scalar::Type=Union{})
+
 When columns `cols` of data frame `df` have iterable elements that define
 `length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
 element of each `col` in `cols` is flattened, meaning the column corresponding
@@ -2274,8 +2275,8 @@ returned `DataFrame` will affect `df`.
 
 If `scalar` is passed then values that have this type in flattened columns
 are treated as scalars and broadcasted as many times as is needed to match
-lengths of values stored in other columns. One row is produced if all
-corresponding values are scalars.
+lengths of values stored in other columns. If all values in a row are scalars,
+a single row is produced.
 
 $METADATA_FIXED
 
@@ -2349,6 +2350,7 @@ julia> df4 = DataFrame(a=[1, 2, 3],
    1 │     1  [1, 2]   [5, 6]
    2 │     2  missing  missing
    3 │     3  missing  [7, 8]
+
 julia> flatten(df4, [:b, :c], scalar=Missing)
 5×3 DataFrame
  Row │ a      b        c
@@ -2411,7 +2413,7 @@ function flatten(df::AbstractDataFrame,
 end
 
 function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type,
-                         df::AbstractDataFrame, col1, coli)
+                         df::AbstractDataFrame, col1::Integer, coli::Integer)
     for (i, v) in enumerate(col)
         v isa scalar && continue
         lv = length(v)

From 8b7e50699921e0b744c161e0d476e73184758025 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 5 Feb 2023 13:53:02 +0100
Subject: [PATCH 3/4] fix incorrect condition in flatten

---
 src/abstractdataframe/abstractdataframe.jl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 51c61fd3c8..4a631aea2e 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -2395,13 +2395,16 @@ function flatten(df::AbstractDataFrame,
     length(idxcols) > 1 && sort!(idxcols)
     for col in idxcols
         col_to_flatten = df[!, col]
-        fast_path = eltype(col_to_flatten) isa AbstractVector &&
+        fast_path = eltype(col_to_flatten) <: AbstractVector &&
                     !isempty(col_to_flatten)
         flattened_col = if fast_path
+                @info "1"
                 reduce(vcat, col_to_flatten)
             elseif scalar === Union{}
+                @info "2"
                 collect(Iterators.flatten(col_to_flatten))
             else
+                @info "3"
                 collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v
                                           for (l, v) in zip(lengths, col_to_flatten)))
             end

From ac5a33e093e77074c5a16d50ca7583db24805ecb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 5 Feb 2023 17:29:50 +0100
Subject: [PATCH 4/4] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/abstractdataframe.jl | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 4a631aea2e..fe6fb842f5 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -2398,13 +2398,10 @@ function flatten(df::AbstractDataFrame,
         fast_path = eltype(col_to_flatten) <: AbstractVector &&
                     !isempty(col_to_flatten)
         flattened_col = if fast_path
-                @info "1"
                 reduce(vcat, col_to_flatten)
             elseif scalar === Union{}
-                @info "2"
                 collect(Iterators.flatten(col_to_flatten))
             else
-                @info "3"
                 collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v
                                           for (l, v) in zip(lengths, col_to_flatten)))
             end