Skip to content

Commit

Permalink
allow scalar broadcasting into an empty data frame and correct broadc…
Browse files Browse the repository at this point in the history
…asting into a cell of a data frame
  • Loading branch information
bkamins authored Jul 25, 2019
1 parent 9f43191 commit ed25099
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 43 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "DataFrames"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "0.19.0"
version = "0.19.1"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand Down
6 changes: 4 additions & 2 deletions docs/src/lib/indexing.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ In particular a description explicitly mentions if the assignment is *in-place*.
(with the exception that if `v` is an `AbstractRange` it gets converted to a `Vector`);
also if `col` is a `Symbol` that is not present in `df` then a new column in `df` is created and holds `v`;
equivalent to `df.col = v` if `col` is a valid identifier;
this is allowed if `ncol(df) == 0 || length(v) == nrow(df)`;
* `df[!, cols] = v` -> is currently disallowed, but is planned to be supported in the future;

Note that only `df[!, col] = v` and `df.col = v` can be used to add a new column to a `DataFrame`.
Expand Down Expand Up @@ -172,9 +173,10 @@ In such an operation `AbstractDataFrame` is considered as two-dimensional and `D
`DataFrameRow` is considered to be column-oriented.

Additional rules:
* in the `df[CartesianIndex(row, col)] .= v`, `df[row, col] .= v` and `df[row, cols] .= v` syntaxes the assignment to `df` is performed in-place;
* in the `df[CartesianIndex(row, col)] .= v` and `df[row, col] .= v` syntaxes `v` is broadcast into the contents of `df[row, col]` (this is consistent with Julia Base);
* in the `df[row, cols] .= v` syntax the assignment to `df` is performed in-place;
* in the `df[rows, col] .= v` and `df[rows, cols] .= v` syntaxes the assignment to `df` is performed in-place;
* in the `df[!, col] .= v` syntax column `col` is replaced by a freshly allocated vector; if `col` is `Symbol` and it is missing from `df` then a new column is added;
* in the `df[!, col] .= v` syntax column `col` is replaced by a freshly allocated vector; if `col` is `Symbol` and it is missing from `df` then a new column is added; the length of the column is always the value of `nrow(df)` before the assignment takes place;
* `df[!, cols] .= v` syntax is currently disallowed, but is planned to be supported in the future;
* `df.col .= v` syntax is allowed and performs in-place assignment to an existing vector `df.col`.
* in the `sdf[CartesianIndex(row, col)] .= v`, `sdf[row, col] .= v` and `sdf[row, cols] .= v` syntaxes the assignment to `sdf` is performed in-place;
Expand Down
21 changes: 9 additions & 12 deletions src/other/broadcasting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ struct LazyNewColDataFrame{T}
col::T
end

# for broadcasting purposes a LazyNewColDataFrame has a single axis whose length
# is the number of rows of the wrapped data frame, i.e. nrow(x.df)
Base.axes(x::LazyNewColDataFrame) = (Base.OneTo(nrow(x.df)),)

# ColReplaceDataFrame is reserved for future extensions if we decide to allow df[!, cols] .= v
Expand All @@ -90,15 +89,16 @@ Base.axes(x::LazyNewColDataFrame) = (Base.OneTo(nrow(x.df)),)

# Base.axes(x::ColReplaceDataFrame) = axes(x.df)

Base.maybeview(df::AbstractDataFrame, idx::CartesianIndex{2}) = view(df, idx[1], idx[2])
# selecting a single cell (by a 2-dimensional CartesianIndex or by an integer row
# and a single column index) returns the value stored in that cell via getindex,
# not a view; any other rows/cols combination returns a view of the data frame
Base.maybeview(df::AbstractDataFrame, idx::CartesianIndex{2}) = df[idx]
Base.maybeview(df::AbstractDataFrame, row::Integer, col::ColumnIndex) = df[row, col]
Base.maybeview(df::AbstractDataFrame, rows, cols) = view(df, rows, cols)

function Base.maybeview(df::DataFrame, ::typeof(!), cols)
if !(cols isa ColumnIndex)
throw(ArgumentError("broadcasting with column replacement is currently allowed only for single column index"))
end
if ncol(df) == 0
throw(ArgumentError("broadcasting into a data frame with no columns is not allowed"))
if !(cols isa Symbol) && cols > ncol(df)
throw(ArgumentError("creating new columns using an integer index by broadcasting is disallowed"))
end
# in the future we might allow cols to target multiple columns
# in which case ColReplaceDataFrame(df, index(df)[cols]) will be returned
Expand All @@ -108,14 +108,11 @@ end
# the `!` row selector is rejected for SubDataFrame: this method unconditionally
# throws an ArgumentError regardless of the column selector passed in `idxs`
Base.maybeview(df::SubDataFrame, ::typeof(!), idxs) =
throw(ArgumentError("broadcasting with ! row selector is not allowed for SubDataFrame"))

function Base.copyto!(lazydf::LazyNewColDataFrame, bc::Base.Broadcast.Broadcasted)
if isempty(lazydf.df)
throw(ArgumentError("creating a column via broadcasting is not allowed on empty data frames"))
end
if bc isa Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}} &&
bc.f === identity && bc.args isa Tuple{Any} && Base.Broadcast.isflat(bc)
T = typeof(bc.args[1][])
col = similar(Vector{T}, nrow(lazydf.df))
function Base.copyto!(lazydf::LazyNewColDataFrame, bc::Base.Broadcast.Broadcasted{T}) where T
if bc isa Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}}
bc_tmp = Base.Broadcast.Broadcasted{T}(bc.f, bc.args, ())
v = Base.Broadcast.materialize(bc_tmp)
col = similar(Vector{typeof(v)}, nrow(lazydf.df))
copyto!(col, bc)
else
col = Base.Broadcast.materialize(bc)
Expand Down
124 changes: 96 additions & 28 deletions test/broadcasting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -598,14 +598,67 @@ end
@test_throws BoundsError dfr[10] .= ones(3)
@test_throws ArgumentError dfr[:z] .= ones(3)
@test df == cdf

df = DataFrame()
@test_throws DimensionMismatch df[!, :a] .= sin.(1:3)
df[!, :b] .= sin.(1)
df[!, :c] .= sin(1) .+ 1
@test df == DataFrame(b=Float64[], c=Float64[])
end

@testset "empty data frame corner case" begin
df = DataFrame()
@test_throws ArgumentError df[!, 1] .= 1
@test_throws ArgumentError df[!, :a] .= [1]
@test_throws ArgumentError df[!, [:a,:b]] .= [1]
@test df == DataFrame()
@test_throws ArgumentError df[!, 2] .= 1
@test_throws ArgumentError df[!, [:a, :b]] .= [1]
@test_throws ArgumentError df[!, [:a, :b]] .= 1
@test_throws DimensionMismatch df[!, :a] .= [1 2]
@test_throws DimensionMismatch df[!, :a] .= [1, 2]
@test_throws DimensionMismatch df[!, :a] .= sin.(1) .+ [1, 2]

for rhs in [1, [1], Int[], "abc", ["abc"]]
df = DataFrame()
df[!, :a] .= rhs
@test size(df) == (0, 1)
@test eltype(df[!, 1]) == (rhs isa AbstractVector ? eltype(rhs) : typeof(rhs))

df = DataFrame()
df[!, :a] .= length.(rhs)
@test size(df) == (0, 1)
@test eltype(df[!, 1]) == Int

df = DataFrame()
df[!, :a] .= length.(rhs) .+ 1
@test size(df) == (0, 1)
@test eltype(df[!, 1]) == Int

df = DataFrame()
@. df[!, :a] = length(rhs) + 1
@test size(df) == (0, 1)
@test eltype(df[!, 1]) == Int

df = DataFrame(x=Int[])
df[!, :a] .= rhs
@test size(df) == (0, 2)
@test eltype(df[!, 2]) == (rhs isa AbstractVector ? eltype(rhs) : typeof(rhs))

df = DataFrame(x=Int[])
df[!, :a] .= length.(rhs)
@test size(df) == (0, 2)
@test eltype(df[!, 2]) == Int

df = DataFrame(x=Int[])
df[!, :a] .= length.(rhs) .+ 1
@test size(df) == (0, 2)
@test eltype(df[!, 2]) == Int

df = DataFrame(x=Int[])
@. df[!, :a] = length(rhs) + 1
@test size(df) == (0, 2)
@test eltype(df[!, 2]) == Int
end

df = DataFrame()
df .= 1
@test df == DataFrame()
df .= [1]
Expand All @@ -615,11 +668,22 @@ end
@test_throws DimensionMismatch df .= ones(1,2)
@test_throws DimensionMismatch df .= ones(1,1,1)

@test_throws ArgumentError df[!, :a] .= 1
@test_throws ArgumentError df[!, [:a, :b]] .= 1

df = DataFrame(a=[])
@test_throws ArgumentError df[!, :b] .= 1
df[!, :b] .= sin.(1)
@test eltype(df.b) == Float64
df[!, :b] .= [1]
@test eltype(df.b) == Int
df[!, :b] .= 'a'
@test eltype(df.b) == Char
@test names(df) == [:a, :b]

c = categorical(["a", "b", "c"])
df = DataFrame()
@test_throws DimensionMismatch df[!, :a] .= c

df[!, :b] .= c[1]
@test nrow(df) == 0
@test df.b isa CategoricalVector{String}
end

@testset "test categorical values" begin
Expand Down Expand Up @@ -694,9 +758,11 @@ end

@testset "scalar on assignment side" begin
df = DataFrame(rand(2, 3))
df[1, 1] .= df[1, 1] .- df[1, 1]
@test_throws MethodError df[1, 1] .= df[1, 1] .- df[1, 1]
df[1, 1:1] .= df[1, 1] .- df[1, 1]
@test df[1, 1] == 0
df[1, 2] .-= df[1, 2]
@test_throws MethodError df[1, 2] .-= df[1, 2]
df[1:1, 2] .-= df[1, 2]
@test df[1, 2] == 0
end

Expand Down Expand Up @@ -940,26 +1006,20 @@ end
@testset "additional checks of post-! broadcasting rules" begin
df = copy(refdf)
v1 = df[!, 1]
df[CartesianIndex(1, 1)] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[CartesianIndex(1, 1)] .= 1
@test_throws MethodError df[CartesianIndex(1, 1)] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[CartesianIndex(1, 1)] .= [1,2]

df = copy(refdf)
v1 = df[!, 1]
df[1, 1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, 1] .= 1
@test_throws MethodError df[1, 1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, 1] .= [1, 2]

df = copy(refdf)
v1 = df[!, 1]
df[1, :x1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, :x1] .= 1
@test_throws MethodError df[1, :x1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, :x1] .= [1, 2]

df = copy(refdf)
Expand Down Expand Up @@ -1088,7 +1148,7 @@ end
@test df == refdf
@test_throws ArgumentError df[!, 10] .= [1,2,3]
@test df == refdf
@test_throws DimensionMismatch df[!, 10] .= [1 2 3]
@test_throws ArgumentError df[!, 10] .= [1 2 3]
@test df == refdf

df = copy(refdf)
Expand All @@ -1110,26 +1170,20 @@ end

df = view(copy(refdf), :, :)
v1 = df[!, 1]
df[CartesianIndex(1, 1)] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[CartesianIndex(1, 1)] .= 1
@test_throws MethodError df[CartesianIndex(1, 1)] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[CartesianIndex(1, 1)] .= [1,2]

df = view(copy(refdf), :, :)
v1 = df[!, 1]
df[1, 1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, 1] .= 1
@test_throws MethodError df[1, 1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, 1] .= [1, 2]

df = view(copy(refdf), :, :)
v1 = df[!, 1]
df[1, :x1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, :x1] .= 1
@test_throws MethodError df[1, :x1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, :x1] .= [1, 2]

df = view(copy(refdf), :, :)
Expand Down Expand Up @@ -1277,4 +1331,18 @@ end
@test df.a !== a
end

# Checks broadcasting into a single cell selected with `df[row, col]`,
# `df[row, :col]` and `df[CartesianIndex(row, col)]`.
@testset "add new correct rules for df[row, col] .= v broadcasting" begin
# the cell holds a scalar (an Int): broadcast assignment into it must throw MethodError
df = DataFrame(a=1)
@test_throws MethodError df[1,1] .= 10
@test_throws MethodError df[1,:a] .= 10
@test_throws MethodError df[CartesianIndex(1,1)] .= 10
# the cell holds a vector: the value is broadcast into the elements of that vector
df = DataFrame(a=[[1,2,3]])
df[1,1] .= 10
@test df == DataFrame(a=[[10,10,10]])
df[1,:a] .= 100
@test df == DataFrame(a=[[100,100,100]])
df[CartesianIndex(1,1)] .= 1000
@test df == DataFrame(a=[[1000,1000,1000]])
end

end # module

0 comments on commit ed25099

Please sign in to comment.