From 5cadf594b8b45cebdd8f86eb58cb1ac7d609e030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 13 Sep 2021 22:24:28 +0200 Subject: [PATCH 1/9] Allow predicate in Cols --- NEWS.md | 2 ++ src/other/index.jl | 11 +++++++++++ test/index.jl | 5 +++++ 3 files changed, 18 insertions(+) diff --git a/NEWS.md b/NEWS.md index 07cce6bb69..bc0133fb13 100644 --- a/NEWS.md +++ b/NEWS.md @@ -70,6 +70,8 @@ * the `DataFrame` constructor when matrix is passed to it as a first argument now allows `copycols` keyword argument ([#2829](https://github.com/JuliaData/DataFrames.jl/pull/2859)) +* `Cols` now accepts a predicate accepting column names as strings. + ([#2880](https://github.com/JuliaData/DataFrames.jl/pull/2880)) ## Bug fixes diff --git a/src/other/index.jl b/src/other/index.jl index 3110a8af5a..547a1a3853 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -221,6 +221,17 @@ end isempty(idx.cols) ? (1:length(x)) : throw(ArgumentError("All(args...) is not supported: use Cols(args...) instead")) @inline Base.getindex(x::AbstractIndex, idx::Cols) = isempty(idx.cols) ? Int[] : union(getindex.(Ref(x), idx.cols)...) +@inline Base.getindex(x::AbstractIndex, idx::Cols{Tuple{typeof(:)}}) = x[:] +@inline Base.getindex(x::AbstractIndex, idx::Cols{<:Tuple{Function}}) = + findall(idx.cols[1], names(x)) + +""" + Cols(f::Function) + +Select the columns whose names passed to the `f` predicate as strings return `true`. +As a special case if `:` is passed (the `Colon` function) select all columns. +""" +Cols @inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{<:Integer}) if any(v -> v isa Bool, idx) diff --git a/test/index.jl b/test/index.jl index 5dbf1d5b17..6d4275a942 100644 --- a/test/index.jl +++ b/test/index.jl @@ -474,6 +474,11 @@ end df = DataFrame(a1=1, a2=2, b1=3, b2=4) @test df[:, Cols(r"a", Not(r"1"))] == df[:, [1, 2, 4]] @test df[:, Cols(Not(r"1"), r"a")] == df[:, [2, 4, 1]] + @test df[:, Cols(x -> x[1] == 'a')] == df[:, [1, 2]] + @test df[:, Cols(x -> x[end] == '1')] == df[:, [1, 3]] + @test df[:, Cols(x -> x[end] == '3')] == DataFrame() + @test_throws MethodError df[:, Cols(x -> true, 1)] == DataFrame() + @test_throws MethodError df[:, Cols(1, x -> true)] == DataFrame() end @testset "views" begin From 472201013bb82c4eb59d8681d319651f7a42d7e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 14 Sep 2021 09:44:30 +0200 Subject: [PATCH 2/9] add Cols to internals --- docs/src/lib/internals.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/lib/internals.md b/docs/src/lib/internals.md index 125314a6c1..326aaa8b67 100644 --- a/docs/src/lib/internals.md +++ b/docs/src/lib/internals.md @@ -16,4 +16,5 @@ getmaxwidths ourshow ourstrwidth @spawn_for_chunks +Cols ``` From bc29631e9f676a8d02dc7998ad163bec8b053b47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 14 Sep 2021 23:24:30 +0200 Subject: [PATCH 3/9] remove special path for : --- src/other/index.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/other/index.jl b/src/other/index.jl index 547a1a3853..72828dde25 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -221,7 +221,6 @@ end isempty(idx.cols) ? (1:length(x)) : throw(ArgumentError("All(args...) is not supported: use Cols(args...) instead")) @inline Base.getindex(x::AbstractIndex, idx::Cols) = isempty(idx.cols) ? Int[] : union(getindex.(Ref(x), idx.cols)...) -@inline Base.getindex(x::AbstractIndex, idx::Cols{Tuple{typeof(:)}}) = x[:] @inline Base.getindex(x::AbstractIndex, idx::Cols{<:Tuple{Function}}) = findall(idx.cols[1], names(x)) From 8337dddc098eadb9d18ba71595e87cd6962fd729 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 15 Sep 2021 16:49:47 +0200 Subject: [PATCH 4/9] Revert "remove special path for :" This reverts commit bc29631e9f676a8d02dc7998ad163bec8b053b47. --- src/other/index.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/other/index.jl b/src/other/index.jl index 72828dde25..547a1a3853 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -221,6 +221,7 @@ end isempty(idx.cols) ? (1:length(x)) : throw(ArgumentError("All(args...) is not supported: use Cols(args...) instead")) @inline Base.getindex(x::AbstractIndex, idx::Cols) = isempty(idx.cols) ? Int[] : union(getindex.(Ref(x), idx.cols)...) +@inline Base.getindex(x::AbstractIndex, idx::Cols{Tuple{typeof(:)}}) = x[:] @inline Base.getindex(x::AbstractIndex, idx::Cols{<:Tuple{Function}}) = findall(idx.cols[1], names(x)) From 92dfaa3bc671409f7909a127f6c242c5a2d28034 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 15 Sep 2021 17:14:04 +0200 Subject: [PATCH 5/9] improve documentation --- docs/src/lib/indexing.md | 12 +++++++- docs/src/man/working_with_dataframes.md | 39 ++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 386ab2150c..b02f062a0e 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -26,7 +26,17 @@ The rules for a valid type of index into a column are the following: * a vector of `Bool` that has to be a subtype of `AbstractVector{Bool}`; * a regular expression, which gets expanded to a vector of matching column names; * a `Not` expression (see [InvertedIndices.jl](https://github.com/mbauman/InvertedIndices.jl)); - * an `Cols`, `All` or `Between` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); + the `Not(idx)` selects all indices not in the passed `idx`; + * an `Cols` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); + the `Cols(idxs...)` selects the union of the selections in `idxs`; in particular + `Cols()` selects no columns and `Cols(:)` selects all columns; a special rule is + `Cols(predicate)`, where `precicate` is a predicate function; in this case + the columns whose names passed to the `predicate` predicate as strings return `true` + are selected. + * `Between` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); + the `Between(first, last)` selects the columns between `first` and `last`; + * `All` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); + the `All()` selects all columns, equivalent to `:`; * a colon literal `:`. The rules for a valid type of index into a row are the following: diff --git a/docs/src/man/working_with_dataframes.md b/docs/src/man/working_with_dataframes.md index 6001ace2af..3125ec02be 100755 --- a/docs/src/man/working_with_dataframes.md +++ b/docs/src/man/working_with_dataframes.md @@ -255,11 +255,9 @@ julia> df[!, Not(:x1)] Finally, you can use `Not`, `Between`, `Cols` and `All` selectors in more complex column selection scenarios (note that `Cols()` selects no columns while `All()` selects all columns therefore `Cols` is a preferred selector if you -write generic code). The following examples move all columns whose names match -`r"x"` regular expression respectively to the front and to the end of a data -frame: +write generic code). Here are examples of using each of these selectors: -``` +```jldoctest dataframe julia> df = DataFrame(r=1, x1=2, x2=3, y=4) 1×4 DataFrame Row │ r x1 x2 y @@ -267,6 +265,39 @@ julia> df = DataFrame(r=1, x1=2, x2=3, y=4) ─────┼──────────────────────────── 1 │ 1 2 3 4 +julia> df[:, Not(:r)] # drop :r column +1×3 DataFrame + Row │ x1 x2 y + │ Int64 Int64 Int64 +─────┼───────────────────── + 1 │ 2 3 4 + +julia> df[:, Between(:r, :x2)] # keep columns between :r and :x2 +1×3 DataFrame + Row │ r x1 x2 + │ Int64 Int64 Int64 +─────┼───────────────────── + 1 │ 1 2 3 + +julia> df[:, All()] # keep all columns +1×4 DataFrame + Row │ r x1 x2 y + │ Int64 Int64 Int64 Int64 +─────┼──────────────────────────── + 1 │ 1 2 3 4 + +julia> df[:, Cols(x -> startswith(x, "x"))] # keep columns whose name starts with "x" +1×2 DataFrame + Row │ x1 x2 + │ Int64 Int64 +─────┼────────────── + 1 │ 2 3 +``` + +The following examples show a more complex use of `Cols` selector and move all +columns whose names match `r"x"` regular expression respectively to the front +and to the end of a data frame: +```jldoctest dataframe julia> df[:, Cols(r"x", :)] 1×4 DataFrame Row │ x1 x2 r y From 0f667b4821c16307631584b1d9b1f92ea798ed6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 20 Sep 2021 11:17:55 +0200 Subject: [PATCH 6/9] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- docs/src/lib/indexing.md | 16 ++++++++-------- docs/src/man/working_with_dataframes.md | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index b02f062a0e..1f49e2f0d5 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -27,16 +27,16 @@ The rules for a valid type of index into a column are the following: * a regular expression, which gets expanded to a vector of matching column names; * a `Not` expression (see [InvertedIndices.jl](https://github.com/mbauman/InvertedIndices.jl)); the `Not(idx)` selects all indices not in the passed `idx`; - * an `Cols` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); - the `Cols(idxs...)` selects the union of the selections in `idxs`; in particular + * a `Cols` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); + `Cols(idxs...)` selects the union of the selections in `idxs`; in particular `Cols()` selects no columns and `Cols(:)` selects all columns; a special rule is - `Cols(predicate)`, where `precicate` is a predicate function; in this case - the columns whose names passed to the `predicate` predicate as strings return `true` + `Cols(predicate)`, where `predicate` is a predicate function; in this case + the columns whose names passed to `predicate` as strings return `true` are selected. - * `Between` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); - the `Between(first, last)` selects the columns between `first` and `last`; - * `All` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); - the `All()` selects all columns, equivalent to `:`; + * a `Between` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); + `Between(first, last)` selects the columns between `first` and `last`; + * an `All` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); + `All()` selects all columns, equivalent to `:`; * a colon literal `:`. The rules for a valid type of index into a row are the following: diff --git a/docs/src/man/working_with_dataframes.md b/docs/src/man/working_with_dataframes.md index 3125ec02be..b7abc2d683 100755 --- a/docs/src/man/working_with_dataframes.md +++ b/docs/src/man/working_with_dataframes.md @@ -294,9 +294,9 @@ julia> df[:, Cols(x -> startswith(x, "x"))] # keep columns whose name starts wit 1 │ 2 3 ``` -The following examples show a more complex use of `Cols` selector and move all +The following examples show a more complex use of the `Cols` selector, which moves all columns whose names match `r"x"` regular expression respectively to the front -and to the end of a data frame: +and to the end of the data frame: ```jldoctest dataframe julia> df[:, Cols(r"x", :)] 1×4 DataFrame From 54b7f5def6ea531fff8d6784a66cf460abde87ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 20 Sep 2021 19:14:50 +0200 Subject: [PATCH 7/9] Apply suggestions from code review --- docs/src/lib/internals.md | 1 - src/other/index.jl | 8 -------- 2 files changed, 9 deletions(-) diff --git a/docs/src/lib/internals.md b/docs/src/lib/internals.md index 326aaa8b67..125314a6c1 100644 --- a/docs/src/lib/internals.md +++ b/docs/src/lib/internals.md @@ -16,5 +16,4 @@ getmaxwidths ourshow ourstrwidth @spawn_for_chunks -Cols ``` diff --git a/src/other/index.jl b/src/other/index.jl index 547a1a3853..e7a5d260cd 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -225,14 +225,6 @@ end @inline Base.getindex(x::AbstractIndex, idx::Cols{<:Tuple{Function}}) = findall(idx.cols[1], names(x)) -""" - Cols(f::Function) - -Select the columns whose names passed to the `f` predicate as strings return `true`. -As a special case if `:` is passed (the `Colon` function) select all columns. -""" -Cols - @inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{<:Integer}) if any(v -> v isa Bool, idx) throw(ArgumentError("Bool values except for AbstractVector{Bool} are not " * From d56b3ea8a98018c68d071a3ceee793225a45b07d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 20 Sep 2021 19:15:12 +0200 Subject: [PATCH 8/9] Update Project.toml --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 51031bd111..5ca253b7bf 100644 --- a/Project.toml +++ b/Project.toml @@ -25,7 +25,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] CategoricalArrays = "0.10.0" Compat = "3.17" -DataAPI = "1.8" +DataAPI = "1.9" InvertedIndices = "1" IteratorInterfaceExtensions = "0.1.1, 1" Missings = "0.4.2, 1" From 70deacd9b03037e98c02d2452253274b29e71c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 20 Sep 2021 19:15:53 +0200 Subject: [PATCH 9/9] Update NEWS.md --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 2844d0feaa..b67347f783 100644 --- a/NEWS.md +++ b/NEWS.md @@ -71,7 +71,7 @@ argument now allows `copycols` keyword argument ([#2859](https://github.com/JuliaData/DataFrames.jl/pull/2859)) * `Cols` now accepts a predicate accepting column names as strings. - ([#2880](https://github.com/JuliaData/DataFrames.jl/pull/2880)) + ([#2881](https://github.com/JuliaData/DataFrames.jl/pull/2881)) ## Bug fixes