From 662b89fa0dc7eb064b471266049ce525db6177d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 30 Mar 2021 11:13:06 +0200 Subject: [PATCH 1/4] check for data frame corruption in deleteat! --- src/dataframe/dataframe.jl | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 3599c8369e..bf6586479e 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -966,9 +966,18 @@ function Base.delete!(df::DataFrame, inds) if !isempty(inds) && size(df, 2) == 0 throw(BoundsError(df, (inds, :))) end + # we require ind to be stored and unique like in Base # otherwise an error will be thrown and the data frame will get corrupted - foreach(col -> deleteat!(col, inds), _columns(df)) + n = nrow(df) + for col in _columns(df) + if length(col) != n + throw(ArgumentError("Unequal lengths of columns detected. Most likely the " * + "same column was stored in the data frame several times. + The source data frame is now corrupted.")) + end + deleteat!(col, inds) + end return df end @@ -977,7 +986,16 @@ function Base.delete!(df::DataFrame, inds::AbstractVector{Bool}) throw(BoundsError(df, (inds, :))) end drop = findall(inds) - foreach(col -> deleteat!(col, drop), _columns(df)) + + n = nrow(df) + for col in _columns(df) + if length(col) != n + throw(ArgumentError("Unequal lengths of columns detected. Most likely the " * + "same column was stored in the data frame several times. + The source data frame is now corrupted.")) + end + deleteat!(col, inds) + end return df end From daa576e32cdbd70873ac0fc83d7726bbb3dc78a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 30 Mar 2021 11:16:10 +0200 Subject: [PATCH 2/4] add tests --- test/dataframe.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/dataframe.jl b/test/dataframe.jl index 413b6866e6..35bd71b633 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -613,6 +613,12 @@ end @test delete!(df, v) == DataFrame(x=[2, 3]) @test x == [2, 3] end + + for inds in (1, [1], [true, false]) + df = DataFrame(x1=[1, 2]) + df.x2 = df.x1 + @test_throws ArgumentError delete!(df, inds) + end end @testset "describe" begin From 820c831294dd161b2cbc47987b31e49504c559a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 30 Mar 2021 14:10:13 +0200 Subject: [PATCH 3/4] improve delete! --- NEWS.md | 2 ++ src/dataframe/dataframe.jl | 43 +++++++++++++++++++++----------------- test/dataframe.jl | 5 +++-- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/NEWS.md b/NEWS.md index 997a18213e..d1ca8e2b7f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,6 +33,8 @@ selected as a property (e.g. `df.col .= 1`) is allowed when column does not exist and it allocates a fresh column ([#2655](https://github.com/JuliaData/DataFrames.jl/pull/2655)) +* `delete!` now correctly handles the case when columns of a data frame are aliased + ([#2690](https://github.com/JuliaData/DataFrames.jl/pull/2690)) ## Deprecated diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index bf6586479e..d3979988d9 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -966,19 +966,10 @@ function Base.delete!(df::DataFrame, inds) if !isempty(inds) && size(df, 2) == 0 throw(BoundsError(df, (inds, :))) end - + # we require ind to be stored and unique like in Base # otherwise an error will be thrown and the data frame will get corrupted - n = nrow(df) - for col in _columns(df) - if length(col) != n - throw(ArgumentError("Unequal lengths of columns detected. Most likely the " * - "same column was stored in the data frame several times. - The source data frame is now corrupted.")) - end - deleteat!(col, inds) - end - return df + return _delete!_helper(df, inds) end function Base.delete!(df::DataFrame, inds::AbstractVector{Bool}) @@ -986,21 +977,35 @@ function Base.delete!(df::DataFrame, inds::AbstractVector{Bool}) throw(BoundsError(df, (inds, :))) end drop = findall(inds) + return _delete!_helper(df, drop) +end + +Base.delete!(df::DataFrame, inds::Not) = delete!(df, axes(df, 1)[inds]) + +function _delete!_helper(df::DataFrame, drop) + cols = _columns(df) + isempty(cols) && return df n = nrow(df) - for col in _columns(df) - if length(col) != n - throw(ArgumentError("Unequal lengths of columns detected. Most likely the " * - "same column was stored in the data frame several times. - The source data frame is now corrupted.")) + col1 = cols[1] + deleteat!(col1, drop) + newn = length(col1) + + for i in 2:length(cols) + col = cols[i] + if length(col) == n + deleteat!(col, drop) end - deleteat!(col, inds) end + + for i in 1:length(cols) + # this should never happen, but we add it for safety + @assert length(cols[i]) == newn corrupt_msg(df, i) + end + return df end -Base.delete!(df::DataFrame, inds::Not) = delete!(df, axes(df, 1)[inds]) - """ empty!(df::DataFrame) diff --git a/test/dataframe.jl b/test/dataframe.jl index 35bd71b633..e34259ef52 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -613,11 +613,12 @@ end @test delete!(df, v) == DataFrame(x=[2, 3]) @test x == [2, 3] end - + for inds in (1, [1], [true, false]) df = DataFrame(x1=[1, 2]) df.x2 = df.x1 - @test_throws ArgumentError delete!(df, inds) + @test delete!(df, inds) === df + @test df == DataFrame(x1=[2], x2=[2]) end end From 3d6979f2c9afd6632743ca43174e5a01f88a5c03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 6 Apr 2021 23:12:30 +0200 Subject: [PATCH 4/4] one more test --- test/dataframe.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/dataframe.jl b/test/dataframe.jl index e34259ef52..68dffb7b6e 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -620,6 +620,10 @@ end @test delete!(df, inds) === df @test df == DataFrame(x1=[2], x2=[2]) end + + df = DataFrame(a=1, b=2) + push!(df.b, 3) + @test_throws AssertionError delete!(df, 1) end @testset "describe" begin