From 316c31484c5025aae7c25d044168033c23e51a30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 14 Mar 2023 23:27:57 +0100 Subject: [PATCH 01/12] add support for Not with multiple indices --- NEWS.md | 8 ++++++++ Project.toml | 2 +- docs/src/lib/indexing.md | 1 + src/other/index.jl | 2 ++ test/index.jl | 8 ++++++++ test/indexing.jl | 9 +++++++++ 6 files changed, 29 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 86721af196..2c3133c19b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# DataFrames.jl v1.6 Release Notes + +## New functionalities + +* `Not` allows for passing multiple postiional arguments that are + treated as if they were wrapped in `Cols` + ([#3302](https://github.com/JuliaData/DataFrames.jl/pull/3302)) + # DataFrames.jl v1.5 Release Notes ## New functionalities diff --git a/Project.toml b/Project.toml index cc56cb70b2..464978361f 100644 --- a/Project.toml +++ b/Project.toml @@ -31,7 +31,7 @@ CategoricalArrays = "0.10.0" Compat = "4.2" DataAPI = "1.14.0" InlineStrings = "1.3.0" -InvertedIndices = "1" +InvertedIndices = "1.3" IteratorInterfaceExtensions = "0.1.1, 1" Missings = "0.4.2, 1" PooledArrays = "1.4.2" diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 326b3e6d40..ed2ca488d6 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -33,6 +33,7 @@ The following values are a valid column index: * a `Not` expression (see [InvertedIndices.jl](https://github.com/JuliaData/InvertedIndices.jl)); `Not(idx)` selects all indices not in the passed `idx`; + when passed as column selector `Not(idx...)` is equivalent to `Not(Cols(idx...))`. * a `Cols` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); `Cols(idxs...)` selects the union of the selections in `idxs`; in particular `Cols()` diff --git a/src/other/index.jl b/src/other/index.jl index b444bbdd9a..8daadca1c6 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -226,6 +226,8 @@ end @inline Base.getindex(x::AbstractIndex, ::Colon) = Base.OneTo(length(x)) @inline Base.getindex(x::AbstractIndex, notidx::Not) = setdiff(1:length(x), getindex(x, notidx.skip)) +@inline Base.getindex(x::AbstractIndex, notidx::Not{InvertedIndices.NotMultiIndex}) = + setdiff(1:length(x), getindex(x, Cols(notidx.skip.indices...))) @inline Base.getindex(x::AbstractIndex, idx::Between) = x[idx.first]:x[idx.last] @inline Base.getindex(x::AbstractIndex, idx::All) = isempty(idx.cols) ? (1:length(x)) : throw(ArgumentError("All(args...) is not supported: use Cols(args...) instead")) diff --git a/test/index.jl b/test/index.jl index a9b5664fdd..a9b652b956 100644 --- a/test/index.jl +++ b/test/index.jl @@ -115,6 +115,9 @@ end si7 = SubIndex(i, Not(1:2)) si8 = SubIndex(i, ["C", "D", "E"]) si9 = SubIndex(i, Not(Not(["C", "D", "E"]))) + si10 = SubIndex(i, Not(1, 2)) + si11 = SubIndex(i, Not(:A, :B)) + si12 = SubIndex(i, Not(2, "A")) @test copy(si1) == i @test copy(si2) == Index([:C, :D, :E]) @@ -125,6 +128,9 @@ end @test copy(si7) == Index([:C, :D, :E]) @test copy(si8) == Index([:C, :D, :E]) @test copy(si9) == Index([:C, :D, :E]) + @test copy(si10) == Index([:C, :D, :E]) + @test copy(si11) == Index([:C, :D, :E]) + @test copy(si12) == Index([:C, :D, :E]) @test_throws ArgumentError SubIndex(i, 1) @test_throws ArgumentError SubIndex(i, :A) @@ -327,6 +333,8 @@ end push!(i, :x131) push!(i, :y13) push!(i, :yy13) + @test i[Not(2, 4, 5)] == [1, 3] + @test i[Not(2, :y13, "yy13")] == [1, 3] @test i[Not(Not(r"x1."))] == [2, 3] @test isempty(i[Not(Not(r"xx"))]) @test i[Not(Not(r""))] == 1:5 diff --git a/test/indexing.jl b/test/indexing.jl index 13a1890f83..7660a9601b 100644 --- a/test/indexing.jl +++ b/test/indexing.jl @@ -19,6 +19,15 @@ using Test, DataFrames, Unicode, Random @test dfx[!, 1] === df[!, names(dfx)[1]] end + @test df[!, Not(1, 2)] == DataFrame(c=7:9) + @test df[!, Not(:b, 1)] == DataFrame(c=7:9) + @test df[!, Not("c", :a)] == DataFrame(b=4:6) + @test df[!, Not(:b, "c", :a)] == DataFrame() + @test df[!, Not([1, 2], :b)] == DataFrame(c=7:9) + @test df[!, Not([:c, :a], :b)] == DataFrame() + @test df[!, Not([1, 2], 2)] == DataFrame(c=7:9) + @test df[!, Not([1, 2], [1, 2])] == DataFrame(c=7:9) + @test df[1, 1] == 1 @test df[1, 1:2] isa DataFrameRow @test df[1, r"[ab]"] isa DataFrameRow From 940a09056296f4fab9bf37b6026bfe7a86c01ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 28 Mar 2023 22:42:23 +0200 Subject: [PATCH 02/12] Update NEWS.md Co-authored-by: Milan Bouchet-Valat --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 2c3133c19b..2eb0d2c391 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ ## New functionalities -* `Not` allows for passing multiple postiional arguments that are +* `Not` allows passing multiple positional arguments that are treated as if they were wrapped in `Cols` ([#3302](https://github.com/JuliaData/DataFrames.jl/pull/3302)) From 24ac22e79dfe16d763535d934db3bc9e0b37815e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 29 Mar 2023 18:07:31 +0200 Subject: [PATCH 03/12] consider one more corner case --- docs/src/lib/indexing.md | 4 +++- test/indexing.jl | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index ed2ca488d6..36bd1fd2b8 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -33,7 +33,9 @@ The following values are a valid column index: * a `Not` expression (see [InvertedIndices.jl](https://github.com/JuliaData/InvertedIndices.jl)); `Not(idx)` selects all indices not in the passed `idx`; - when passed as column selector `Not(idx...)` is equivalent to `Not(Cols(idx...))`. + when passed as column selector `Not(idx...)` is equivalent to + `Not([idx...])` if all indices are integers or to + `Not(Cols(idx...))` otherwise. * a `Cols` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); `Cols(idxs...)` selects the union of the selections in `idxs`; in particular `Cols()` diff --git a/test/indexing.jl b/test/indexing.jl index 7660a9601b..c7dd69d110 100644 --- a/test/indexing.jl +++ b/test/indexing.jl @@ -20,8 +20,11 @@ using Test, DataFrames, Unicode, Random end @test df[!, Not(1, 2)] == DataFrame(c=7:9) + @test_throws ArgumentError df[!, Not(1, 1, 2)] @test df[!, Not(:b, 1)] == DataFrame(c=7:9) + @test df[!, Not(:b, :b, 1)] == DataFrame(c=7:9) @test df[!, Not("c", :a)] == DataFrame(b=4:6) + @test df[!, Not("c", "c", :a)] == DataFrame(b=4:6) @test df[!, Not(:b, "c", :a)] == DataFrame() @test df[!, Not([1, 2], :b)] == DataFrame(c=7:9) @test df[!, Not([:c, :a], :b)] == DataFrame() From de52258cc9e2f7bd63ec33301d5bb4c89c08117b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 2 Apr 2023 20:42:11 +0200 Subject: [PATCH 04/12] make `Not` de-duplcate vectors of indices --- NEWS.md | 3 ++- src/other/index.jl | 4 +++- test/indexing.jl | 7 ++++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2eb0d2c391..d37e939d85 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,8 @@ ## New functionalities * `Not` allows passing multiple positional arguments that are - treated as if they were wrapped in `Cols` + treated as if they were wrapped in `Cols` and does not throw an error + when a vector of duplicate indices is passed when doing column selection ([#3302](https://github.com/JuliaData/DataFrames.jl/pull/3302)) # DataFrames.jl v1.5 Release Notes diff --git a/src/other/index.jl b/src/other/index.jl index 8daadca1c6..af07db662f 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -226,7 +226,9 @@ end @inline Base.getindex(x::AbstractIndex, ::Colon) = Base.OneTo(length(x)) @inline Base.getindex(x::AbstractIndex, notidx::Not) = setdiff(1:length(x), getindex(x, notidx.skip)) -@inline Base.getindex(x::AbstractIndex, notidx::Not{InvertedIndices.NotMultiIndex}) = +@inline Base.getindex(x::AbstractIndex, notidx::Not{<:AbstractVector}) = + setdiff(1:length(x), getindex(x, unique(notidx.skip))) + @inline Base.getindex(x::AbstractIndex, notidx::Not{InvertedIndices.NotMultiIndex}) = setdiff(1:length(x), getindex(x, Cols(notidx.skip.indices...))) @inline Base.getindex(x::AbstractIndex, idx::Between) = x[idx.first]:x[idx.last] @inline Base.getindex(x::AbstractIndex, idx::All) = diff --git a/test/indexing.jl b/test/indexing.jl index c7dd69d110..f4b01e2de0 100644 --- a/test/indexing.jl +++ b/test/indexing.jl @@ -20,11 +20,16 @@ using Test, DataFrames, Unicode, Random end @test df[!, Not(1, 2)] == DataFrame(c=7:9) - @test_throws ArgumentError df[!, Not(1, 1, 2)] + @test df[!, Not(1, 1, 2)] == DataFrame(c=7:9) + @test df[!, Not([1, 1, 2])] == DataFrame(c=7:9) @test df[!, Not(:b, 1)] == DataFrame(c=7:9) @test df[!, Not(:b, :b, 1)] == DataFrame(c=7:9) @test df[!, Not("c", :a)] == DataFrame(b=4:6) @test df[!, Not("c", "c", :a)] == DataFrame(b=4:6) + @test df[!, Not(:c, :c, :a)] == DataFrame(b=4:6) + @test df[!, Not([:c, :c, :a])] == DataFrame(b=4:6) + @test df[!, Not("c", "c", "a")] == DataFrame(b=4:6) + @test df[!, Not(["c", "c", "a"])] == DataFrame(b=4:6) @test df[!, Not(:b, "c", :a)] == DataFrame() @test df[!, Not([1, 2], :b)] == DataFrame(c=7:9) @test df[!, Not([:c, :a], :b)] == DataFrame() From 547f911e27fc962e147ce71bc1d5557718d266b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 2 Apr 2023 20:42:52 +0200 Subject: [PATCH 05/12] fix indent --- src/other/index.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/other/index.jl b/src/other/index.jl index af07db662f..26111a6314 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -228,7 +228,7 @@ end setdiff(1:length(x), getindex(x, notidx.skip)) @inline Base.getindex(x::AbstractIndex, notidx::Not{<:AbstractVector}) = setdiff(1:length(x), getindex(x, unique(notidx.skip))) - @inline Base.getindex(x::AbstractIndex, notidx::Not{InvertedIndices.NotMultiIndex}) = +@inline Base.getindex(x::AbstractIndex, notidx::Not{InvertedIndices.NotMultiIndex}) = setdiff(1:length(x), getindex(x, Cols(notidx.skip.indices...))) @inline Base.getindex(x::AbstractIndex, idx::Between) = x[idx.first]:x[idx.last] @inline Base.getindex(x::AbstractIndex, idx::All) = From bd8e8da871c2d903bf310fa14151bf4156a6cf6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 2 Apr 2023 23:15:01 +0200 Subject: [PATCH 06/12] correctly handle case of Bool --- src/other/index.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/other/index.jl b/src/other/index.jl index 26111a6314..ee441bf7cb 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -226,9 +226,13 @@ end @inline Base.getindex(x::AbstractIndex, ::Colon) = Base.OneTo(length(x)) @inline Base.getindex(x::AbstractIndex, notidx::Not) = setdiff(1:length(x), getindex(x, notidx.skip)) -@inline Base.getindex(x::AbstractIndex, notidx::Not{<:AbstractVector}) = - setdiff(1:length(x), getindex(x, unique(notidx.skip))) -@inline Base.getindex(x::AbstractIndex, notidx::Not{InvertedIndices.NotMultiIndex}) = + +function @inline Base.getindex(x::AbstractIndex, notidx::Not{<:AbstractVector}) + skip = notidx.skip + todrop = getindex(x, eltype(skip) === Bool ? skip : unique(skip)) + return setdiff(1:length(x), todrop) +end + @inline Base.getindex(x::AbstractIndex, notidx::Not{InvertedIndices.NotMultiIndex}) = setdiff(1:length(x), getindex(x, Cols(notidx.skip.indices...))) @inline Base.getindex(x::AbstractIndex, idx::Between) = x[idx.first]:x[idx.last] @inline Base.getindex(x::AbstractIndex, idx::All) = From bfcb00eb412c299101a134ac01fdf69b1d79375a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 2 Apr 2023 23:17:13 +0200 Subject: [PATCH 07/12] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- docs/src/lib/indexing.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 36bd1fd2b8..89435caada 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -34,8 +34,7 @@ The following values are a valid column index: [InvertedIndices.jl](https://github.com/JuliaData/InvertedIndices.jl)); `Not(idx)` selects all indices not in the passed `idx`; when passed as column selector `Not(idx...)` is equivalent to - `Not([idx...])` if all indices are integers or to - `Not(Cols(idx...))` otherwise. + `Not(Cols(idx...))`. * a `Cols` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); `Cols(idxs...)` selects the union of the selections in `idxs`; in particular `Cols()` From b7a0322ff37e9742bbe29fa07bb2d795c5714ffa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 2 Apr 2023 23:18:03 +0200 Subject: [PATCH 08/12] fix code layout --- src/other/index.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/other/index.jl b/src/other/index.jl index ee441bf7cb..1047a496dc 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -232,7 +232,8 @@ function @inline Base.getindex(x::AbstractIndex, notidx::Not{<:AbstractVector}) todrop = getindex(x, eltype(skip) === Bool ? skip : unique(skip)) return setdiff(1:length(x), todrop) end - @inline Base.getindex(x::AbstractIndex, notidx::Not{InvertedIndices.NotMultiIndex}) = + +@inline Base.getindex(x::AbstractIndex, notidx::Not{InvertedIndices.NotMultiIndex}) = setdiff(1:length(x), getindex(x, Cols(notidx.skip.indices...))) @inline Base.getindex(x::AbstractIndex, idx::Between) = x[idx.first]:x[idx.last] @inline Base.getindex(x::AbstractIndex, idx::All) = From 3c204f052aabc7697d64fef7f0ff163422ccc045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 2 Apr 2023 23:23:01 +0200 Subject: [PATCH 09/12] fix @inline location --- src/other/index.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/other/index.jl b/src/other/index.jl index 1047a496dc..51aa3a31cc 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -227,7 +227,7 @@ end @inline Base.getindex(x::AbstractIndex, notidx::Not) = setdiff(1:length(x), getindex(x, notidx.skip)) -function @inline Base.getindex(x::AbstractIndex, notidx::Not{<:AbstractVector}) +@inline function Base.getindex(x::AbstractIndex, notidx::Not{<:AbstractVector}) skip = notidx.skip todrop = getindex(x, eltype(skip) === Bool ? skip : unique(skip)) return setdiff(1:length(x), todrop) From 2e305389a73b9a4f1328c6bd355197a87be45bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 3 Apr 2023 23:35:10 +0200 Subject: [PATCH 10/12] fix old tests --- test/index.jl | 10 +++++++--- test/select.jl | 3 ++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/test/index.jl b/test/index.jl index a9b652b956..fc82540eab 100644 --- a/test/index.jl +++ b/test/index.jl @@ -50,9 +50,13 @@ using DataFrames: Index, SubIndex, fuzzymatch @test_throws ArgumentError i[Not(:x)] @test_throws ArgumentError i[Not("x")] @test_throws BoundsError i[Not(1:3)] - @test_throws ArgumentError i[Not([1, 1])] - @test_throws ArgumentError i[Not([:A, :A])] - @test_throws ArgumentError i[Not(["A", "A"])] + + @test i[Not([1, 1])] == [2] + @test i[Not([:A, :A])] == [2] + @test i[Not(["A", "A"])] == [2] + @test isempty(i[Not([true, true])]) + @test i[Not([false, false])] == 1:2 + @test i[Not([true, false])] == [2] @test i[1:1] == 1:1 diff --git a/test/select.jl b/test/select.jl index 2f46f15718..40a2e6a2b7 100644 --- a/test/select.jl +++ b/test/select.jl @@ -20,10 +20,11 @@ Random.seed!(1234) df = DataFrame(a=1, b=2, c=3, d=4, e=5) @test_throws BoundsError select!(df, Not(0)) @test_throws BoundsError select!(df, Not(6)) - @test_throws ArgumentError select!(df, Not([1, 1])) @test_throws ArgumentError select!(df, Not(:f)) @test_throws BoundsError select!(df, Not([true, false])) + @test select!(copy(df), Not([1, 1])) == df[!, 2:end] + d = copy(df) select!(d, Not([:a, :e, :c])) @test d == DataFrame(b=2, d=4) From 1dc5b0ff756f877a138255857bd564cda036d4c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 4 Apr 2023 09:03:14 +0200 Subject: [PATCH 11/12] another test fix --- test/select.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/select.jl b/test/select.jl index 40a2e6a2b7..0793ba368e 100644 --- a/test/select.jl +++ b/test/select.jl @@ -64,10 +64,11 @@ end df = DataFrame(a=1, b=2, c=3, d=4, e=5) @test_throws BoundsError select(df, Not(0)) @test_throws BoundsError select(df, Not(6)) - @test_throws ArgumentError select(df, Not([1, 1])) @test_throws ArgumentError select(df, Not(:f)) @test_throws BoundsError select(df, Not([true, false])) + @test select(df, Not([1, 1])) == df[!, 2:end] + df2 = copy(df) d = select(df, Not([:a, :e, :c])) @test d == df[:, [:b, :d]] From 338bfe56b3445fa0d46abee731628b2b5684124a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 4 Apr 2023 10:16:31 +0200 Subject: [PATCH 12/12] another old test fix --- test/select.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/select.jl b/test/select.jl index 0793ba368e..ee57cfaf6a 100644 --- a/test/select.jl +++ b/test/select.jl @@ -153,10 +153,11 @@ end df = view(DataFrame(a=1, b=2, c=3, d=4, e=5), :, :) @test_throws BoundsError select(df, Not(0)) @test_throws BoundsError select(df, Not(6)) - @test_throws ArgumentError select(df, Not([1, 1])) @test_throws ArgumentError select(df, Not(:f)) @test_throws BoundsError select(df, Not([true, false])) + @test select(df, Not([1, 1])) == df[!, 2:end] + df2 = copy(df) d = select(df, Not([:a, :e, :c])) @test d isa DataFrame