diff --git a/NEWS.md b/NEWS.md index 65c4daa400..ddbf7204cc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,8 @@ ([#2573](https://github.com/JuliaData/DataFrames.jl/pull/2573)) * add `subset` and `subset!` functions that allow to subset rows ([#2496](https://github.com/JuliaData/DataFrames.jl/pull/2496)) +* `names` now allows passing a predicate as a column selector + ([#2417](https://github.com/JuliaData/DataFrames.jl/pull/2417)) ## Deprecated diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 75703ca840..df21c74683 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -68,12 +68,16 @@ Return a freshly allocated `Vector{String}` of names of columns contained in `df If `cols` is passed then restrict returned column names to those matching the selector (this is useful in particular with regular expressions, `Cols`, `Not`, and `Between`). -`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR) -or a `Type`, in which case columns whose `eltype` is a subtype of `cols` are returned. +`cols` can be: +* any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR) +* a `Type`, in which case names of columns whose `eltype` is a subtype of `cols` + are returned +* a `Function` predicate taking the column name as a string and returning `true` + for columns that should be kept See also [`propertynames`](@ref) which returns a `Vector{Symbol}`. 
""" -Base.names(df::AbstractDataFrame) = names(index(df)) +Base.names(df::AbstractDataFrame, cols::Colon=:) = names(index(df)) function Base.names(df::AbstractDataFrame, cols) nms = _names(index(df)) @@ -84,6 +88,7 @@ end Base.names(df::AbstractDataFrame, T::Type) = [String(n) for (n, c) in pairs(eachcol(df)) if eltype(c) <: T] +Base.names(df::AbstractDataFrame, fun::Function) = filter!(fun, names(df)) # _names returns Vector{Symbol} without copying _names(df::AbstractDataFrame) = _names(index(df)) diff --git a/src/dataframerow/dataframerow.jl b/src/dataframerow/dataframerow.jl index 6deca3b86c..7cf062b37a 100644 --- a/src/dataframerow/dataframerow.jl +++ b/src/dataframerow/dataframerow.jl @@ -263,7 +263,7 @@ Base.@propagate_inbounds Base.setindex!(r::DataFrameRow, value, idx) = index(r::DataFrameRow) = getfield(r, :colindex) -Base.names(r::DataFrameRow) = names(index(r)) +Base.names(r::DataFrameRow, cols::Colon=:) = names(index(r)) function Base.names(r::DataFrameRow, cols) nms = _names(index(r)) @@ -272,6 +272,10 @@ function Base.names(r::DataFrameRow, cols) return [string(nms[i]) for i in idxs] end +Base.names(r::DataFrameRow, T::Type) = + [String(n) for n in _names(r) if eltype(parent(r)[!, n]) <: T] +Base.names(r::DataFrameRow, fun::Function) = filter!(fun, names(r)) + _names(r::DataFrameRow) = view(_names(parent(r)), parentcols(index(r), :)) Base.haskey(r::DataFrameRow, key::Bool) = diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index 293e4f0b2e..9e416464f6 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -282,9 +282,9 @@ function Base.isequal(gd1::GroupedDataFrame, gd2::GroupedDataFrame) all(x -> isequal(x...), zip(gd1, gd2)) end -Base.names(gd::GroupedDataFrame) = names(gd.parent) -Base.names(gd::GroupedDataFrame, cols) = names(gd.parent, cols) -_names(gd::GroupedDataFrame) = _names(gd.parent) +Base.names(gd::GroupedDataFrame) = names(parent(gd)) 
+Base.names(gd::GroupedDataFrame, cols) = names(parent(gd), cols) +_names(gd::GroupedDataFrame) = _names(parent(gd)) function DataFrame(gd::GroupedDataFrame; copycols::Bool=true, keepkeys::Bool=true) if !copycols diff --git a/test/dataframe.jl b/test/dataframe.jl index d0688bafd1..27f78d4d30 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -1925,16 +1925,34 @@ end @test_throws ArgumentError push!(df, "a") end -@testset "names for Type" begin - df = DataFrame(a1 = 1:3, a2 = [1, missing, 3], - b1 = 1.0:3.0, b2 = [1.0, missing, 3.0], - c1 = '1':'3', c2 = ['1', missing, '3']) - @test names(df, Int) == ["a1"] - @test names(df, Union{Missing, Int}) == ["a1", "a2"] - @test names(df, Real) == ["a1", "b1"] - @test names(df, Union{Missing, Real}) == ["a1", "a2", "b1", "b2"] - @test names(df, Any) == names(df) - @test names(df, Union{Char, Float64, Missing}) == ["b1", "b2", "c1", "c2"] +@testset "names for Type, predicate + standard tests of cols" begin + df_long = DataFrame(a1 = 1:3, a2 = [1, missing, 3], + b1 = 1.0:3.0, b2 = [1.0, missing, 3.0], + c1 = '1':'3', c2 = ['1', missing, '3'], x=1:3) + for x in (df_long[:, Not(end)], @view(df_long[:, Not(end)]), + groupby(df_long[:, Not(end)], :a1), groupby(@view(df_long[:, Not(end)]), :a1), + eachrow(df_long[:, Not(end)]), eachrow(@view(df_long[:, Not(end)])), + eachcol(df_long[:, Not(end)]), eachcol(@view(df_long[:, Not(end)])), + df_long[1, Not(end)]) + @test names(x, 1) == ["a1"] + @test names(x, "a1") == ["a1"] + @test names(x, :a1) == ["a1"] + @test names(x, [2, 1]) == ["a2", "a1"] + @test names(x, ["a2", "a1"]) == ["a2", "a1"] + @test names(x, [:a2, :a1]) == ["a2", "a1"] + @test names(x, Int) == ["a1"] + @test names(x, Union{Missing, Int}) == ["a1", "a2"] + @test names(x, Real) == ["a1", "b1"] + @test names(x, Union{Missing, Real}) == ["a1", "a2", "b1", "b2"] + @test names(x, Any) == names(x) + @test isempty(names(x, BigInt)) + @test names(x, Union{Char, Float64, Missing}) == ["b1", "b2", "c1", "c2"] + @test 
names(x, startswith("a")) == ["a1", "a2"] + @test names(x, :) == names(x) + @test names(x, <("a2")) == ["a1"] + + @test_throws TypeError names(x, x -> 1) + end end end # module