Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add scalar keyword argument to flatten #3283

Merged
merged 4 commits into from
Feb 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
* Add `haskey` and `get` methods to `DataFrameColumns`
to make it support dictionary interface more completely
([#3282](https://github.com/JuliaData/DataFrames.jl/pull/3282))
* Allow passing the `scalar` keyword argument to `flatten`
([#3283](https://github.com/JuliaData/DataFrames.jl/pull/3283))

## Bug fixes

Expand Down
79 changes: 64 additions & 15 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2259,7 +2259,7 @@ function Missings.allowmissing(df::AbstractDataFrame,
end

"""
flatten(df::AbstractDataFrame, cols)
flatten(df::AbstractDataFrame, cols; scalar::Type=Union{})

When columns `cols` of data frame `df` have iterable elements that define
`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
Expand All @@ -2273,6 +2273,11 @@ returned `DataFrame` will affect `df`.

`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).

If `scalar` is passed then values that have this type in flattened columns
are treated as scalars and broadcasted as many times as is needed to match
lengths of values stored in other columns. If all values in a row are scalars,
a single row is produced.

$METADATA_FIXED

# Examples
Expand Down Expand Up @@ -2334,10 +2339,33 @@ julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])

julia> flatten(df3, [:b, :c])
ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2

julia> df4 = DataFrame(a=[1, 2, 3],
b=[[1, 2], missing, missing],
c=[[5, 6], missing, [7, 8]])
3×3 DataFrame
Row │ a b c
│ Int64 Array…? Array…?
─────┼─────────────────────────
1 │ 1 [1, 2] [5, 6]
2 │ 2 missing missing
3 │ 3 missing [7, 8]

julia> flatten(df4, [:b, :c], scalar=Missing)
bkamins marked this conversation as resolved.
Show resolved Hide resolved
5×3 DataFrame
Row │ a b c
│ Int64 Int64? Int64?
─────┼─────────────────────────
1 │ 1 1 5
2 │ 1 2 6
3 │ 2 missing missing
4 │ 3 missing 7
5 │ 3 missing 8
```
"""
function flatten(df::AbstractDataFrame,
cols::Union{ColumnIndex, MultiColumnIndex})
cols::Union{ColumnIndex, MultiColumnIndex};
scalar::Type=Union{})
_check_consistency(df)

idxcols = index(df)[cols]
Expand All @@ -2348,15 +2376,16 @@ function flatten(df::AbstractDataFrame,
end

col1 = first(idxcols)
lengths = length.(df[!, col1])
for col in idxcols
v = df[!, col]
if any(x -> length(x[1]) != x[2], zip(v, lengths))
r = findfirst(x -> x != 0, length.(v) .- lengths)
colnames = _names(df)
throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
"and :$(colnames[col]) are not the same in row $r"))
end
lengths = Int[x isa scalar ? -1 : length(x) for x in df[!, col1]]
for (i, coli) in enumerate(idxcols)
i == 1 && continue
update_lengths!(lengths, df[!, coli], scalar, df, col1, coli)
end

# handle case where in all columns we had a scalar
# in this case we keep it one time
for i in 1:length(lengths)
lengths[i] == -1 && (lengths[i] = 1)
end

new_df = similar(df[!, Not(cols)], sum(lengths))
Expand All @@ -2366,18 +2395,38 @@ function flatten(df::AbstractDataFrame,
length(idxcols) > 1 && sort!(idxcols)
for col in idxcols
col_to_flatten = df[!, col]
fast_path = eltype(col_to_flatten) isa AbstractVector &&
fast_path = eltype(col_to_flatten) <: AbstractVector &&
!isempty(col_to_flatten)
flattened_col = fast_path ?
reduce(vcat, col_to_flatten) :
collect(Iterators.flatten(col_to_flatten))
flattened_col = if fast_path
reduce(vcat, col_to_flatten)
bkamins marked this conversation as resolved.
Show resolved Hide resolved
elseif scalar === Union{}
collect(Iterators.flatten(col_to_flatten))
else
collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v
for (l, v) in zip(lengths, col_to_flatten)))
end
insertcols!(new_df, col, _names(df)[col] => flattened_col)
end

_copy_all_note_metadata!(new_df, df)
return new_df
end

# Reconcile the per-row `lengths` (as initialised by `flatten` from column `col1`)
# with the values stored in `col`, the column at index `coli`.
#
# Values that are `isa scalar` are skipped and impose no constraint on their row.
# For any other value its `length` is taken:
# * if the row still holds the sentinel `-1`, the length is recorded for that row;
# * if the row already holds a different length, an `ArgumentError` naming both
#   columns and the offending row is thrown.
#
# `df`, `col1` and `coli` are used only to build the error message.
# Mutates `lengths` in place and returns `nothing`.
function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type,
                         df::AbstractDataFrame, col1::Integer, coli::Integer)
    for (row, value) in enumerate(col)
        if !(value isa scalar)
            len = length(value)
            recorded = lengths[row]
            if recorded == -1
                # nothing but scalars seen in this row so far: adopt this length
                lengths[row] = len
            elseif recorded != len
                colnames = _names(df)
                throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
                                    "and :$(colnames[coli]) are not the same in row $row"))
            end
        end
    end
    return nothing
end

function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector,
lengths::AbstractVector{Int})
counter = 1
Expand Down
65 changes: 65 additions & 0 deletions test/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,71 @@ end
@test flatten(DataFrame(), All()) == DataFrame()
end

# Tests for the `scalar` keyword argument of `flatten`: values whose type matches
# `scalar` are expected to be kept as single values and repeated to match the
# lengths of the iterables stored in the other flattened columns of the same row.
@testset "flatten with scalar" begin
# Columns :b and :c mix vectors with `missing`; :a holds plain integers.
df = DataFrame(a=[1, 2, 3],
b=[[1, 2], missing, [3, 4]],
c=[[5, 6], missing, missing])
# Flattening the integer column :a is expected to leave the data frame unchanged.
@test flatten(df, :a) ≅ df
# Without `scalar`, a `missing` entry in a flattened column is expected to throw
# (presumably because `length(missing)` has no method).
@test_throws MethodError flatten(df, :b)
# With `scalar=Missing` the `missing` in :b yields one row; the non-flattened
# column :c is repeated as-is for every produced row.
@test flatten(df, :b, scalar=Missing) ≅
DataFrame(a=[1, 1, 2, 3, 3],
b=[1, 2, missing, 3, 4],
c=[[5, 6], [5, 6], missing, missing, missing])
# Flattening both columns: in row 3 the `missing` in :c is repeated to match the
# two elements of :b; row 2 (all scalars) yields exactly one row.
@test flatten(df, [:b, :c], scalar=Missing) ≅
DataFrame(a=[1, 1, 2, 3, 3],
b=[1, 2, missing, 3, 4],
c=[5, 6, missing, missing, missing])
# `scalar=Any` matches every value, so nothing is expanded.
@test flatten(df, [:b, :c], scalar=Any) ≅ df

# Single-row data frame whose columns hold `missing`, a 1-element vector and
# a 2-element vector.
df = DataFrame(a=missing, b=[1], c=missing, d=[[1, 2]])
# :b (length 1) and :d (length 2) disagree, so flattening all columns must fail.
@test_throws ArgumentError flatten(df, All(), scalar=Missing)
# Leaving :d out: the remaining lengths agree and a single row is produced.
@test flatten(df, Not(:d), scalar=Missing) ≅
DataFrame(a=missing, b=[1], c=missing, d=[[1, 2]])
# Leaving :b out: the missings are repeated to the length of :d (two rows).
@test flatten(df, Not(:b), scalar=Missing) ≅
DataFrame(a=[missing, missing], b=[1, 1], c=[missing, missing], d=[1, 2])

# By default a string is flattened into its characters; with `scalar=String`
# it is kept whole and repeated instead.
df = DataFrame(a="xy", b=[[1, 2]])
@test flatten(df, [:a, :b]) == DataFrame(a=['x', 'y'], b=[1, 2])
@test flatten(df, [:a, :b], scalar=String) ==
DataFrame(a=["xy", "xy"], b=[1, 2])

# A row with an empty iterable produces no output rows (id 2 is absent below),
# while a row consisting only of scalars (id 4) is kept exactly once.
df = DataFrame(a=[[1], [], [3, 4], missing], b = missings(4), id=1:4)
@test flatten(df, [:a, :b], scalar=Missing) ≅
DataFrame(a=[1, 3, 4, missing], b=missings(4), id=[1, 3, 3, 4])
# Ranges of growing length in :x; :y is `missing` where the last element of :x
# is even and the same range otherwise. The first row holds the empty range 1:0
# and therefore contributes no rows (expected ids start at 2).
df = DataFrame(id=1:10, x=[1:i-1 for i in 1:10])
df.y = [iseven(last(v)) ? missing : v for v in df.x]
@test flatten(df, [:x, :y], scalar=Missing) ≅
DataFrame(id=reduce(vcat, [fill(i, i-1) for i in 2:10]),
x=reduce(vcat, [1:i for i in 1:9]),
y=reduce(vcat, [iseven(i) ? missings(i) : (1:i) for i in 1:9]))

# Below are tests showing handling of strings
# One-character strings mixed with `missing` and 2-element ranges.
df = DataFrame(id=1:5,
col1=["a", missing, 1:2, 3:4, 5:6],
col2=[11:12, 111:112, 1111:1112, missing, "b"])
# Treating both `missing` and strings as scalars repeats each of them twice.
@test flatten(df, [:col1, :col2], scalar=Union{Missing, AbstractString}) ≅
DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5],
col1=["a", "a", missing, missing, 1, 2, 3, 4, 5, 6],
col2=[11, 12, 111, 112, 1111, 1112, missing, missing, "b", "b"])
# No `scalar`: the `missing` entries are not handled.
@test_throws MethodError flatten(df, [:col1, :col2])
# Only `missing` is scalar: "a" is then an iterable of length 1, which does not
# match the 2-element range stored in the same row.
@test_throws ArgumentError flatten(df, [:col1, :col2], scalar=Missing)
# Only strings are scalar: the `missing` entries are still not handled.
@test_throws MethodError flatten(df, [:col1, :col2], scalar=AbstractString)

# Same layout with two-character strings, whose length matches the ranges.
df = DataFrame(id=1:5,
col1=["ab", missing, 1:2, 3:4, 5:6],
col2=[11:12, 111:112, 1111:1112, missing, "cd"])
@test flatten(df, [:col1, :col2], scalar=Union{Missing, AbstractString}) ≅
DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5],
col1=["ab", "ab", missing, missing, 1, 2, 3, 4, 5, 6],
col2=[11, 12, 111, 112, 1111, 1112, missing, missing, "cd", "cd"])
@test_throws MethodError flatten(df, [:col1, :col2])
# Only `missing` is scalar: the strings are now split into their characters,
# which works because their length (2) matches the ranges.
@test flatten(df, [:col1, :col2], scalar=Missing) ≅
DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5],
col1=['a', 'b', missing, missing, 1, 2, 3, 4, 5, 6],
col2=[11, 12, 111, 112, 1111, 1112, missing, missing, 'c', 'd'])
@test_throws MethodError flatten(df, [:col1, :col2], scalar=AbstractString)
end

@testset "stack categorical test" begin
Random.seed!(1234)
d1 = DataFrame(a=repeat([1:3;], inner=[4]),
Expand Down