Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow push!/pushfirst!/append!/prepend! with multiple values #3372

Merged
merged 17 commits into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# DataFrames.jl v1.7.0 Release Notes

## New functionalities

* Allow passing multiple values to add in `push!`, `pushfirst!`,
`append!`, and `prepend!`
([#3372](https://github.com/JuliaData/DataFrames.jl/pull/3372))

# DataFrames.jl v1.6.1 Release Notes

## Bug fixes
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "DataFrames"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "1.6.1"
version = "1.7.0"

[deps]
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
Expand Down
173 changes: 156 additions & 17 deletions src/dataframe/insertion.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
"""
append!(df::DataFrame, df2::AbstractDataFrame; cols::Symbol=:setequal,
promote::Bool=(cols in [:union, :subset]))
append!(df::DataFrame, table; cols::Symbol=:setequal,
append!(df::DataFrame, tables...; cols::Symbol=:setequal,
promote::Bool=(cols in [:union, :subset]))

Add the rows of `df2` to the end of `df`. If the second argument `table` is not
an `AbstractDataFrame` then it is converted using `DataFrame(table,
copycols=false)` before being appended.
Add the rows of tables passed as `tables` to the end of `df`. If the table is not
an `AbstractDataFrame` then it is converted using
`DataFrame(table, copycols=false)` before being appended.

The exact behavior of `append!` depends on the `cols` argument:
* If `cols == :setequal` (this is the default) then `df2` must contain exactly
Expand Down Expand Up @@ -78,18 +76,53 @@ julia> df1
4 │ 4 4
5 │ 5 5
6 │ 6 6

julia> append!(df2, DataFrame(A=1), (; C=1:2), cols=:union)
6×3 DataFrame
Row │ A B C
│ Float64? Int64? Int64?
─────┼─────────────────────────────
1 │ 4.0 4 missing
2 │ 5.0 5 missing
3 │ 6.0 6 missing
4 │ 1.0 missing missing
5 │ missing missing 1
6 │ missing missing 2
```
"""
function Base.append!(df1::DataFrame, df2::AbstractDataFrame; cols::Symbol=:setequal,
                      promote::Bool=(cols in [:union, :subset]))
    # Rows of `df2` go after the existing rows of `df1` (`atend=true`);
    # all the work is done by the helper shared with `prepend!`.
    return _append_or_prepend!(df1, df2, cols=cols, promote=promote, atend=true)
end

function Base.append!(df::DataFrame, table; cols::Symbol=:setequal,
                      promote::Bool=(cols in [:union, :subset]))
    # A `Dict` is unordered, so a request to match column order is rejected.
    (table isa Dict && cols == :orderequal) &&
        throw(ArgumentError("passing `Dict` as `table` when `cols` is equal to " *
                            "`:orderequal` is not allowed as it is unordered"))

    # Wrap `table` in a `DataFrame` without copying its columns, then append
    # the resulting data frame with the same keyword arguments.
    converted = DataFrame(table, copycols=false)
    return append!(df, converted, cols=cols, promote=promote)
end

function Base.append!(df::DataFrame, @nospecialize tables...;
                      cols::Symbol=:setequal,
                      promote::Bool=(cols in [:union, :subset]))
    # Validate `cols` up front so that an invalid value is reported even when
    # `tables` is empty and the fold below performs no `append!` call.
    if !(cols in (:orderequal, :setequal, :intersect, :subset, :union))
        throw(ArgumentError("`cols` keyword argument must be " *
                            ":orderequal, :setequal, :intersect, :subset or :union"))
    end

    # Append the tables one at a time, left to right, so the rows end up in
    # the order in which the tables were passed. `collect(Any, tables)` keeps
    # the fold from being specialized on the tuple type of `tables`.
    return foldl((df, table) -> append!(df, table, cols=cols, promote=promote),
                 collect(Any, tables), init=df)
end

"""
prepend!(df::DataFrame, df2::AbstractDataFrame; cols::Symbol=:setequal,
promote::Bool=(cols in [:union, :subset]))
prepend!(df::DataFrame, table; cols::Symbol=:setequal,
prepend!(df::DataFrame, tables...; cols::Symbol=:setequal,
promote::Bool=(cols in [:union, :subset]))

Add the rows of tables passed as `tables` to the beginning of `df`. If the table is not
an `AbstractDataFrame` then it is converted using
`DataFrame(table, copycols=false)` before being prepended.

Add the rows of `df2` to the beginning of `df`. If the second argument `table`
is not an `AbstractDataFrame` then it is converted using `DataFrame(table,
copycols=false)` before being prepended.
Expand Down Expand Up @@ -164,12 +197,45 @@ julia> df1
4 │ 1 1
5 │ 2 2
6 │ 3 3

julia> prepend!(df2, DataFrame(A=1), (; C=1:2), cols=:union)
6×3 DataFrame
Row │ A B C
│ Float64? Int64? Int64?
─────┼─────────────────────────────
1 │ 1.0 missing missing
2 │ missing missing 1
3 │ missing missing 2
4 │ 4.0 4 missing
5 │ 5.0 5 missing
6 │ 6.0 6 missing
```
"""
function Base.prepend!(df1::DataFrame, df2::AbstractDataFrame; cols::Symbol=:setequal,
                       promote::Bool=(cols in [:union, :subset]))
    # Rows of `df2` go before the existing rows of `df1` (`atend=false`);
    # all the work is done by the helper shared with `append!`.
    return _append_or_prepend!(df1, df2, cols=cols, promote=promote, atend=false)
end

function Base.prepend!(df::DataFrame, table; cols::Symbol=:setequal,
                       promote::Bool=(cols in [:union, :subset]))
    # A `Dict` is unordered, so a request to match column order is rejected.
    (table isa Dict && cols == :orderequal) &&
        throw(ArgumentError("passing `Dict` as `table` when `cols` is equal to " *
                            "`:orderequal` is not allowed as it is unordered"))

    # Wrap `table` in a `DataFrame` without copying its columns, then prepend
    # the resulting data frame with the same keyword arguments.
    converted = DataFrame(table, copycols=false)
    return prepend!(df, converted, cols=cols, promote=promote)
end

function Base.prepend!(df::DataFrame, @nospecialize tables...;
                       cols::Symbol=:setequal,
                       promote::Bool=(cols in [:union, :subset]))
    # Validate `cols` up front so that an invalid value is reported even when
    # `tables` is empty and the fold below performs no `prepend!` call.
    if !(cols in (:orderequal, :setequal, :intersect, :subset, :union))
        throw(ArgumentError("`cols` keyword argument must be " *
                            ":orderequal, :setequal, :intersect, :subset or :union"))
    end

    # Fold from the right: the last table is prepended first and the first
    # table last, so the rows of `tables[1]` end up at the very top, followed
    # by the remaining tables in the order passed, then the original rows.
    return foldr((table, df) -> prepend!(df, table, cols=cols, promote=promote),
                 collect(Any, tables), init=df)
end

function _append_or_prepend!(df1::DataFrame, df2::AbstractDataFrame; cols::Symbol,
promote::Bool, atend::Bool)
if !(cols in (:orderequal, :setequal, :intersect, :subset, :union))
Expand Down Expand Up @@ -355,6 +421,10 @@ following way:
added to `df` (using `missing` for existing rows) and a `missing` value is
pushed to columns missing in `row` that are present in `df`.

If `row` is not a `DataFrameRow`, `NamedTuple`, `AbstractDict`, or `Tables.AbstractRow`,
the value of the `cols` argument is ignored and it may only be set to
`:setequal` or `:orderequal`.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we allow :orderequal? I imagine this value is used only when you want to protect yourself from possible inversions of columns, and without names we cannot guarantee that. (Of course :setequal is also a bit weird but we need to allow the default.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue is that we allow push!(df, (1,2), (a=1, b=2), cols=:orderequal). That is, we allow mixing rows with names and rows without names, in which case cols=:orderequal makes sense in some cases. That is why I allowed cols in the first place.

If we wanted to disallow this (i.e. disallow mixing named and unnamed containers, which I would also be OK with) then I will redesign the proposal and disallow cols when unnamed containers are passed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we wanted to disallow this (i.e. disallow mixing named and unnamed containers, which I would also be OK with) then I will redesign the proposal and disallow cols when unnamed containers are passed.

If that's not too complex, it would be safer to do that, yes. It shouldn't be super common to add several rows of different types at the same time, and people can use two calls if needed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have updated the code to disallow mixing.


Also I want to check with you the idea that we could allow to write:


julia> pushfirst!(DataFrame(), ByRow([(;a=1), (;b=2)]), cols=:union)
2×2 DataFrame
 Row │ b        a       
     │ Int64?   Int64?
─────┼──────────────────
   1 │ missing        1
   2 │       2  missing

instead of current:

pushfirst!(DataFrame(), [(;a=1), (;b=2)]..., cols=:union)

as the latter gets problematic for a lot of passed arguments case:

julia> @time pushfirst!(DataFrame(), ByRow(repeat([(;a=1), (;b=2)], 10000)), cols=:union);
  0.009458 seconds (277.47 k allocations: 12.276 MiB)

julia> @time pushfirst!(DataFrame(), repeat([(;a=1), (;b=2)], 10000)..., cols=:union);
  1.969564 seconds (397.48 k allocations: 5.976 GiB, 16.87% gc time)

This is a bit of overuse of ByRow (which was designed for other purpose), but I found it a natural name. What do you think? (if you think it is OK, then the same decision is with append! and prepend! - do we also want to allow for the ByRow option instead of splatting multiple tables?)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not a fan TBH. If you have a collection of rows, wouldn't it be more logical to use append! or prepend!? Currently you can do append!(DataFrame(), Tables.dictrowtable(repeat([(;a=1), (;b=2)], 10000)), cols=:union), right? It's not super easy to discover, but the ByRow trick isn't either.

Maybe we could simplify this somehow? For example, would there be a way to make append!(DataFrame(), repeat([(;a=1), (;b=2)], 10000), cols=:union) automatically wrap the input vector in a Tables.dictrowtable if needed?

Copy link
Member Author

@bkamins bkamins Sep 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In short: we are looking for an equivalent of Tables.dictrowtable that would be lazy and allocate less (it is enough that it supports the Tables.AbstractRow interface; it does not have to be a dictionary).


In detail: When you do:

julia> x = repeat([(;a=1), (;b=2)], 10^6);

julia> @time Tables.dictrowtable(x)
  4.747105 seconds (42.48 M allocations: 2.623 GiB, 21.35% gc time, 6.76% compilation time)
Tables.DictRowTable([:a, :b], Dict{Symbol, Type}(:a => Union{Missing, Int64}, :b => Union{Missing, Int64}), Dict{Symbol, Any}[Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2)  …  Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2)])

julia> @time Tables.dictrowtable(x)
  4.960894 seconds (42.00 M allocations: 2.593 GiB, 30.86% gc time)
Tables.DictRowTable([:a, :b], Dict{Symbol, Type}(:a => Union{Missing, Int64}, :b => Union{Missing, Int64}), Dict{Symbol, Any}[Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2)  …  Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2), Dict(:a => 1), Dict(:b => 2)])

it is very slow and allocates a lot. The reason is that Tables.DictRowTable has three fields:
:names, :types, and :values. And :values is eager and creates Dict for each entry.

The lazy variant could have :names and :types fields but :values could just point lazily to the source table and later iterate its rows when needed. In particular if source table is columnar creating such an iterator should be super cheap (as we do not even need to iterate it most likely). If the source table has row-wise storage then we would need to iterate it once.

This was the initial idea. In short, to reduce the cost of Tables.dictrowtable.


Now regarding your proposal:

Some modification to Tables.rows or Tables.columns that would do column-unioning if requested?

This is a valid way to implement it and maybe indeed better and sufficient (i.e. we do not have to be lazy as e.g Tables.columns can materialize the columns provided it is efficient).
The point is that if we could pass Tables.columns(x, cols=:union) in the original code this is exactly what is needed.

Then the cols kwarg ideally could have the following values:

  • If cols == :setequal (this is the default) then rows must contain exactly the same columns (but possibly in a different order), order is defined by the first row.
  • If cols == :orderequal then rows must contain the same columns in the same order
  • If cols == :intersect then rows may contain more columns than first row, but all column names that are present in first row must be present in all rows and only they are used to populate a new rows (this is the current behavior).
  • If cols == :subset then the behavior is like for :intersect but if some column is missing in rows then a missing value is pushed.
  • If cols == :union then column unioning is performed

Of course we do not have to support all. I think natural options that must be supported are:

  • :intersect (as this is the current behavior)
  • :union (this is the most commonly requested extension)
  • :orderequal (this is what probably people typically expect by default as they even do not know that the current behavior is :intersect)

The reason is that we then could call e.g. DataFrame(Tables.columns(x, cols=:union)) and it would be very fast, while now the operation is super slow:

julia> @time DataFrame(Tables.dictrowtable(x));
  6.498086 seconds (47.27 M allocations: 2.757 GiB, 17.54% gc time, 6.79% compilation time)

julia> @time DataFrame(Tables.dictrowtable(x));
  4.213657 seconds (46.00 M allocations: 2.686 GiB, 19.43% gc time)

compare it to:

julia> y = repeat([(;a=1, b=2)], 2*10^6);

julia> @time DataFrame(y);
  0.102461 seconds (160.85 k allocations: 41.335 MiB, 93.52% compilation time)

julia> @time DataFrame(y);
  0.008306 seconds (22 allocations: 30.519 MiB)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that the upper bound for what is achievable for the example data is:

julia> x = repeat([(;a=1), (;b=2)], 10^6);

julia> @time (df = DataFrame(); foreach(row -> push!(df, row, cols=:union), x))
  1.242811 seconds (28.00 M allocations: 1.204 GiB, 1.52% compilation time)

This is still a bit inefficient (as we are adding data to the df row by row, trying to do union each time), but it is already much faster than Tables.dictrowtable, so I expect that it is possible to get a sub-second performance.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I was thinking that append!(..., cols=:union) would call DataFrame(Tables.columns(t, cols=:union)). It seems enough to have Tables.columns support cols=:union for that. cols=:orderequal and cols=:setequal would also make sense but I wouldn't use them in append!.

OTC, adding cols=:subset doesn't sound like a good idea to me as it seems weird to me to take the first row as a reference. The fact that the current behavior does that isn't great IMO. At any rate calling it cols=:intersect could be confusing as I would expect it to only retain columns that appear in all rows.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now I realized that we already have Tables.dictcolumntable which is (as expected) much faster:

julia> @time DataFrame(Tables.dictcolumntable(x));
  1.057388 seconds (10.00 M allocations: 297.543 MiB)

julia> @time DataFrame(Tables.dictcolumntable(x));
  1.206029 seconds (10.00 M allocations: 297.543 MiB)

Copy link
Member Author

@bkamins bkamins Oct 1, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nalimilan - let us merge this PR (if you are OK with it). Then I will make a separate PR that instead of your proposed DataFrame(Tables.column(t, cols=:union)) will call DataFrame(Tables.dictcolumntable(t)) when cols=:union. This should be a good enough solution.


If `promote=true` and element type of a column present in `df` does not allow
the type of a pushed argument then a new column with a promoted element type
allowing it is freshly allocated and stored in `df`. If `promote=false` an error
Expand All @@ -371,12 +441,16 @@ $METADATA_FIXED
"""

"""
push!(df::DataFrame, row::Union{Tuple, AbstractArray}; promote::Bool=false)
push!(df::DataFrame, row::Union{Tuple, AbstractArray};
cols::Symbol=:setequal, promote::Bool=false)
push!(df::DataFrame, row::Union{DataFrameRow, NamedTuple, AbstractDict,
Tables.AbstractRow};
cols::Symbol=:setequal, promote::Bool=(cols in [:union, :subset]))
push!(df::DataFrame, rows...;
cols::Symbol=:setequal, promote::Bool=(cols in [:union, :subset]))

Add one row at the end of `df` in-place, taking the values from `row`.
Several rows can be added at once by passing them as separate positional arguments (`rows`).

$INSERTION_COMMON

Expand Down Expand Up @@ -452,18 +526,38 @@ julia> push!(df, NamedTuple(), cols=:subset)
6 │ 11 12 missing
7 │ 1.0 missing 1.0
8 │ missing missing missing

julia> push!(DataFrame(a=1, b=2), (3, 4), (b=6, a=5))
3×2 DataFrame
Row │ a b
│ Int64 Int64
─────┼──────────────
1 │ 1 2
2 │ 3 4
3 │ 5 6
```
"""
function Base.push!(df::DataFrame, row::Any;
                    cols=:setequal, promote::Bool=false)
    # `cols` is accepted so that this method can be called with the same
    # keyword arguments as the other `push!` methods. It is only validated
    # here and is not forwarded to `_row_inserter!`.
    if !(cols in (:setequal, :orderequal))
        throw(ArgumentError("`cols` keyword argument must be " *
                            ":orderequal or :setequal"))
    end

    return _row_inserter!(df, -1, row, Val{:push}(), promote)
end

"""
pushfirst!(df::DataFrame, row::Union{Tuple, AbstractArray}; promote::Bool=false)
pushfirst!(df::DataFrame, row::Union{Tuple, AbstractArray};
cols::Symbol=:setequal, promote::Bool=false)
pushfirst!(df::DataFrame, row::Union{DataFrameRow, NamedTuple, AbstractDict,
Tables.AbstractRow};
cols::Symbol=:setequal, promote::Bool=(cols in [:union, :subset]))
pushfirst!(df::DataFrame, rows...;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that I have reverted this part of your update of the docs. The reason is that rows... allows for mixing of the first and second style (named and unnamed rows) in the current design. I could disallow this if you prefer, see the comment https://github.com/JuliaData/DataFrames.jl/pull/3372/files#r1314234174

cols::Symbol=:setequal, promote::Bool=(cols in [:union, :subset]))
bkamins marked this conversation as resolved.
Show resolved Hide resolved

Add one row at the beginning of `df` in-place, taking the values from `row`.
bkamins marked this conversation as resolved.
Show resolved Hide resolved
Several rows can be added at once by passing them as separate positional arguments (`rows`).

$INSERTION_COMMON

Expand Down Expand Up @@ -539,13 +633,30 @@ julia> pushfirst!(df, NamedTuple(), cols=:subset)
6 │ a 1 missing
7 │ b 2 missing
8 │ c 3 missing

julia> pushfirst!(DataFrame(a=1, b=2), (3, 4), (b=6, a=5))
3×2 DataFrame
Row │ a b
│ Int64 Int64
─────┼──────────────
1 │ 3 4
2 │ 5 6
3 │ 1 2
```
"""
function Base.pushfirst!(df::DataFrame, row::Any;
                         cols=:setequal, promote::Bool=false)
    # `cols` is accepted so that this method can be called with the same
    # keyword arguments as the other `pushfirst!` methods. It is only
    # validated here and is not forwarded to `_row_inserter!`.
    if !(cols in (:setequal, :orderequal))
        throw(ArgumentError("`cols` keyword argument must be " *
                            ":orderequal or :setequal"))
    end

    return _row_inserter!(df, -1, row, Val{:pushfirst}(), promote)
end

"""
insert!(df::DataFrame, index::Integer, row::Union{Tuple, AbstractArray}; promote::Bool=false)
insert!(df::DataFrame, index::Integer, row::Union{Tuple, AbstractArray};
cols::Symbol=:setequal, promote::Bool=false)
insert!(df::DataFrame, index::Integer, row::Union{DataFrameRow, NamedTuple,
AbstractDict, Tables.AbstractRow};
cols::Symbol=:setequal, promote::Bool=(cols in [:union, :subset]))
Expand Down Expand Up @@ -629,7 +740,12 @@ julia> insert!(df, 3, NamedTuple(), cols=:subset)
8 │ 1.0 missing 1.0
```
"""
function Base.insert!(df::DataFrame, index::Integer, row::Any; promote::Bool=false)
function Base.insert!(df::DataFrame, index::Integer, row::Any;
cols=:setequal, promote::Bool=false)
if !(cols in (:setequal, :orderequal))
throw(ArgumentError("`cols` keyword argument must be " *
":orderequal or :setequal"))
end
index isa Bool && throw(ArgumentError("invalid index: $index of type Bool"))
1 <= index <= nrow(df)+1 ||
throw(ArgumentError("invalid index: $index for data frame with $(nrow(df)) rows"))
Expand Down Expand Up @@ -986,3 +1102,26 @@ function _row_inserter!(df::DataFrame, loc::Integer,
_drop_all_nonnote_metadata!(df)
return df
end

function Base.push!(df::DataFrame, @nospecialize rows...;
                    cols::Symbol=:setequal,
                    promote::Bool=(cols in [:union, :subset]))
    # Validate `cols` up front so that an invalid value is reported even when
    # `rows` is empty and the fold below performs no `push!` call.
    if !(cols in (:orderequal, :setequal, :intersect, :subset, :union))
        throw(ArgumentError("`cols` keyword argument must be " *
                            ":orderequal, :setequal, :intersect, :subset or :union"))
    end

    # Push the rows one at a time, left to right, so they are stored in the
    # order in which they were passed. `collect(Any, rows)` keeps the fold
    # from being specialized on the tuple type of `rows`.
    return foldl((df, row) -> push!(df, row, cols=cols, promote=promote),
                 collect(Any, rows), init=df)
end

function Base.pushfirst!(df::DataFrame, @nospecialize rows...;
                         cols::Symbol=:setequal,
                         promote::Bool=(cols in [:union, :subset]))
    # Validate `cols` up front so that an invalid value is reported even when
    # `rows` is empty and the fold below performs no `pushfirst!` call.
    if !(cols in (:orderequal, :setequal, :intersect, :subset, :union))
        throw(ArgumentError("`cols` keyword argument must be " *
                            ":orderequal, :setequal, :intersect, :subset or :union"))
    end

    # Fold from the right: the last row is inserted at the front first and
    # the first row last, so `rows[1]` ends up as the first row of `df`,
    # followed by the remaining rows in the order passed.
    return foldr((row, df) -> pushfirst!(df, row, cols=cols, promote=promote),
                 collect(Any, rows), init=df)
end
20 changes: 1 addition & 19 deletions src/other/tables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -63,31 +63,13 @@ function DataFrame(x; copycols::Union{Nothing, Bool}=nothing)
end

# the logic here relies on the fact that Tables.CopiedColumns
# is the only exception for default copycols value
# is the only exception for default copycols value
function DataFrame(x, cnames::AbstractVector; makeunique::Bool=false,
                   copycols::Union{Nothing, Bool}=nothing)
    # When `copycols` is `nothing`, copy unless `x` is a `Tables.CopiedColumns`.
    docopy = something(copycols, !(x isa Tables.CopiedColumns))
    df = DataFrame(x, copycols=docopy)
    # Replace the column names with `cnames` converted by `_name2symbol`.
    return rename!(df, _name2symbol(cnames), makeunique=makeunique)
end

function Base.append!(df::DataFrame, table; cols::Symbol=:setequal,
                      promote::Bool=(cols in [:union, :subset]))
    # Column order cannot be checked against an unordered `Dict`.
    if cols == :orderequal && table isa Dict
        msg = "passing `Dict` as `table` when `cols` is equal to " *
              "`:orderequal` is not allowed as it is unordered"
        throw(ArgumentError(msg))
    end
    # Convert without copying columns, then append the resulting data frame.
    return append!(df, DataFrame(table, copycols=false), cols=cols, promote=promote)
end

function Base.prepend!(df::DataFrame, table; cols::Symbol=:setequal,
                       promote::Bool=(cols in [:union, :subset]))
    # Column order cannot be checked against an unordered `Dict`.
    if cols == :orderequal && table isa Dict
        msg = "passing `Dict` as `table` when `cols` is equal to " *
              "`:orderequal` is not allowed as it is unordered"
        throw(ArgumentError(msg))
    end
    # Convert without copying columns, then prepend the resulting data frame.
    return prepend!(df, DataFrame(table, copycols=false), cols=cols, promote=promote)
end

# This supports the Tables.RowTable type; needed to avoid ambiguities w/ another constructor
function DataFrame(x::AbstractVector{NamedTuple{names, T}};
                   copycols::Bool=true) where {names, T}
    columns = Tables.columns(Tables.IteratorWrapper(x))
    # The `copycols` keyword is not consulted here; `copycols=false` is
    # always forwarded to `fromcolumns`.
    return fromcolumns(columns, collect(names), copycols=false)
end
Expand Down
Loading
Loading