diff --git a/NEWS.md b/NEWS.md index a819f07f29..b67347f783 100644 --- a/NEWS.md +++ b/NEWS.md @@ -70,6 +70,8 @@ * the `DataFrame` constructor when matrix is passed to it as a first argument now allows `copycols` keyword argument ([#2859](https://github.com/JuliaData/DataFrames.jl/pull/2859)) +* `Cols` now accepts a predicate function that takes column names as strings + ([#2881](https://github.com/JuliaData/DataFrames.jl/pull/2881)) ## Bug fixes diff --git a/Project.toml b/Project.toml index 51031bd111..5ca253b7bf 100644 --- a/Project.toml +++ b/Project.toml @@ -25,7 +25,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] CategoricalArrays = "0.10.0" Compat = "3.17" -DataAPI = "1.8" +DataAPI = "1.9" InvertedIndices = "1" IteratorInterfaceExtensions = "0.1.1, 1" Missings = "0.4.2, 1" diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 386ab2150c..1f49e2f0d5 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -26,7 +26,17 @@ The rules for a valid type of index into a column are the following: * a vector of `Bool` that has to be a subtype of `AbstractVector{Bool}`; * a regular expression, which gets expanded to a vector of matching column names; * a `Not` expression (see [InvertedIndices.jl](https://github.com/mbauman/InvertedIndices.jl)); - * an `Cols`, `All` or `Between` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); + `Not(idx)` selects all indices not in the passed `idx`; + * a `Cols` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); + `Cols(idxs...)` selects the union of the selections in `idxs`; in particular + `Cols()` selects no columns and `Cols(:)` selects all columns; a special rule is + `Cols(predicate)`, where `predicate` is a predicate function; in this case + the columns for which `predicate` returns `true` when passed the column name + as a string are selected. 
+ * a `Between` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); + `Between(first, last)` selects the columns between `first` and `last` (inclusive); + * an `All` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl)); + `All()` selects all columns, equivalent to `:`; * a colon literal `:`. The rules for a valid type of index into a row are the following: diff --git a/docs/src/man/working_with_dataframes.md b/docs/src/man/working_with_dataframes.md index 6001ace2af..b7abc2d683 100755 --- a/docs/src/man/working_with_dataframes.md +++ b/docs/src/man/working_with_dataframes.md @@ -255,11 +255,9 @@ julia> df[!, Not(:x1)] Finally, you can use `Not`, `Between`, `Cols` and `All` selectors in more complex column selection scenarios (note that `Cols()` selects no columns while `All()` selects all columns therefore `Cols` is a preferred selector if you -write generic code). The following examples move all columns whose names match -`r"x"` regular expression respectively to the front and to the end of a data -frame: +write generic code). 
Here are examples of using each of these selectors: -``` +```jldoctest dataframe julia> df = DataFrame(r=1, x1=2, x2=3, y=4) 1×4 DataFrame Row │ r x1 x2 y @@ -267,6 +265,39 @@ julia> df = DataFrame(r=1, x1=2, x2=3, y=4) ─────┼──────────────────────────── 1 │ 1 2 3 4 +julia> df[:, Not(:r)] # drop :r column +1×3 DataFrame + Row │ x1 x2 y + │ Int64 Int64 Int64 +─────┼───────────────────── + 1 │ 2 3 4 + +julia> df[:, Between(:r, :x2)] # keep columns between :r and :x2 +1×3 DataFrame + Row │ r x1 x2 + │ Int64 Int64 Int64 +─────┼───────────────────── + 1 │ 1 2 3 + +julia> df[:, All()] # keep all columns +1×4 DataFrame + Row │ r x1 x2 y + │ Int64 Int64 Int64 Int64 +─────┼──────────────────────────── + 1 │ 1 2 3 4 + +julia> df[:, Cols(x -> startswith(x, "x"))] # keep columns whose name starts with "x" +1×2 DataFrame + Row │ x1 x2 + │ Int64 Int64 +─────┼────────────── + 1 │ 2 3 +``` + +The following examples show a more complex use of the `Cols` selector, moving all +columns whose names match the `r"x"` regular expression to the front +and to the end of the data frame, respectively: +```jldoctest dataframe julia> df[:, Cols(r"x", :)] 1×4 DataFrame Row │ x1 x2 r y diff --git a/src/other/index.jl b/src/other/index.jl index 3110a8af5a..e7a5d260cd 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -221,6 +221,9 @@ end isempty(idx.cols) ? (1:length(x)) : throw(ArgumentError("All(args...) is not supported: use Cols(args...) instead")) @inline Base.getindex(x::AbstractIndex, idx::Cols) = isempty(idx.cols) ? Int[] : union(getindex.(Ref(x), idx.cols)...) 
+@inline Base.getindex(x::AbstractIndex, idx::Cols{Tuple{typeof(:)}}) = x[:] +@inline Base.getindex(x::AbstractIndex, idx::Cols{<:Tuple{Function}}) = + findall(idx.cols[1], names(x)) @inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{<:Integer}) if any(v -> v isa Bool, idx) diff --git a/test/index.jl b/test/index.jl index 5dbf1d5b17..6d4275a942 100644 --- a/test/index.jl +++ b/test/index.jl @@ -474,6 +474,11 @@ end df = DataFrame(a1=1, a2=2, b1=3, b2=4) @test df[:, Cols(r"a", Not(r"1"))] == df[:, [1, 2, 4]] @test df[:, Cols(Not(r"1"), r"a")] == df[:, [2, 4, 1]] + @test df[:, Cols(x -> x[1] == 'a')] == df[:, [1, 2]] + @test df[:, Cols(x -> x[end] == '1')] == df[:, [1, 3]] + @test df[:, Cols(x -> x[end] == '3')] == DataFrame() + @test_throws MethodError df[:, Cols(x -> true, 1)] == DataFrame() + @test_throws MethodError df[:, Cols(1, x -> true)] == DataFrame() end @testset "views" begin