Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add predicate support for names and more tests #2417

Merged
merged 12 commits into from
Jan 31, 2021
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
([#2573](https://github.com/JuliaData/DataFrames.jl/pull/2573))
* add `subset` and `subset!` functions that allow subsetting rows
([#2496](https://github.com/JuliaData/DataFrames.jl/pull/2496))
* `names` now allows passing a predicate as a column selector
([#2417](https://github.com/JuliaData/DataFrames.jl/pull/2417))

## Deprecated

Expand Down
11 changes: 8 additions & 3 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,16 @@ Return a freshly allocated `Vector{String}` of names of columns contained in `df

If `cols` is passed then restrict returned column names to those matching the
selector (this is useful in particular with regular expressions, `Cols`, `Not`, and `Between`).
`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR)
or a `Type`, in which case columns whose `eltype` is a subtype of `cols` are returned.
`cols` can be:
* any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR)
* a `Type`, in which case names of columns whose `eltype` is a subtype of `cols`
are returned
* a `Function` predicate taking the column name as a string and returning `true`
for columns that should be kept

See also [`propertynames`](@ref) which returns a `Vector{Symbol}`.
"""
Base.names(df::AbstractDataFrame) = names(index(df))
function Base.names(df::AbstractDataFrame, cols::Colon=:)
    # A `:` selector (also the default) keeps every column, so the names are
    # taken directly from the data frame's index.
    return names(index(df))
end

function Base.names(df::AbstractDataFrame, cols)
nms = _names(index(df))
Expand All @@ -84,6 +88,7 @@ end

# Return the names (as `String`s) of the columns of `df` whose `eltype` is a
# subtype of `T`, in the order in which `pairs(eachcol(df))` yields them.
function Base.names(df::AbstractDataFrame, T::Type)
    return [
        String(colname) for (colname, col) in pairs(eachcol(df)) if eltype(col) <: T
    ]
end
# Return the names of the columns of `df` for which the predicate `fun`, called
# with the column name as a string, returns `true`.
function Base.names(df::AbstractDataFrame, fun::Function)
    # `names(df)` is documented to return a freshly allocated vector, so it can
    # be filtered in place without affecting `df`.
    allnames = names(df)
    return filter!(fun, allnames)
end

# _names returns Vector{Symbol} without copying
# (for a data frame it simply forwards to the `_names` method of its index)
function _names(df::AbstractDataFrame)
    return _names(index(df))
end
Expand Down
6 changes: 5 additions & 1 deletion src/dataframerow/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ Base.@propagate_inbounds Base.setindex!(r::DataFrameRow, value, idx) =

# Return the column index object stored in the `colindex` field of `r`.
# NOTE(review): `getfield` is presumably used because property access on a
# `DataFrameRow` is overloaded for column lookup — confirm.
function index(r::DataFrameRow)
    return getfield(r, :colindex)
end

Base.names(r::DataFrameRow) = names(index(r))
# With a `:` selector (also the default) every column of the row is kept, so the
# names are taken directly from the row's index.
function Base.names(r::DataFrameRow, cols::Colon=:)
    return names(index(r))
end

function Base.names(r::DataFrameRow, cols)
nms = _names(index(r))
Expand All @@ -272,6 +272,10 @@ function Base.names(r::DataFrameRow, cols)
return [string(nms[i]) for i in idxs]
end

# Return the names (as `String`s) of the columns of `r` whose column in the
# parent data frame has an `eltype` that is a subtype of `T`.
function Base.names(r::DataFrameRow, T::Type)
    df = parent(r)
    # `df[!, colname]` is used only to query the column's element type.
    return [String(colname) for colname in _names(r) if eltype(df[!, colname]) <: T]
end
# Return the names of the columns of `r` for which the predicate `fun`, called
# with the column name, returns `true`. The vector returned by `names(r)` is
# filtered in place and returned.
function Base.names(r::DataFrameRow, fun::Function)
    rownames = names(r)
    return filter!(fun, rownames)
end

# Return a view into the parent's `_names` vector, restricted to the parent
# column positions that `r`'s index maps to for the `:` selector.
function _names(r::DataFrameRow)
    parentnames = _names(parent(r))
    selected = parentcols(index(r), :)
    return view(parentnames, selected)
end

Base.haskey(r::DataFrameRow, key::Bool) =
Expand Down
6 changes: 3 additions & 3 deletions src/groupeddataframe/groupeddataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,9 @@ function Base.isequal(gd1::GroupedDataFrame, gd2::GroupedDataFrame)
all(x -> isequal(x...), zip(gd1, gd2))
end

Base.names(gd::GroupedDataFrame) = names(gd.parent)
Base.names(gd::GroupedDataFrame, cols) = names(gd.parent, cols)
_names(gd::GroupedDataFrame) = _names(gd.parent)
# Column names of a grouped data frame are those of its parent data frame.
function Base.names(gd::GroupedDataFrame)
    return names(parent(gd))
end
# Forward any selector `cols` to the `names` method of the parent data frame.
function Base.names(gd::GroupedDataFrame, cols)
    return names(parent(gd), cols)
end
# Forward to the parent data frame's `_names`.
function _names(gd::GroupedDataFrame)
    return _names(parent(gd))
end

function DataFrame(gd::GroupedDataFrame; copycols::Bool=true, keepkeys::Bool=true)
if !copycols
Expand Down
38 changes: 28 additions & 10 deletions test/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1925,16 +1925,34 @@ end
@test_throws ArgumentError push!(df, "a")
end

@testset "names for Type" begin
df = DataFrame(a1 = 1:3, a2 = [1, missing, 3],
b1 = 1.0:3.0, b2 = [1.0, missing, 3.0],
c1 = '1':'3', c2 = ['1', missing, '3'])
@test names(df, Int) == ["a1"]
@test names(df, Union{Missing, Int}) == ["a1", "a2"]
@test names(df, Real) == ["a1", "b1"]
@test names(df, Union{Missing, Real}) == ["a1", "a2", "b1", "b2"]
@test names(df, Any) == names(df)
@test names(df, Union{Char, Float64, Missing}) == ["b1", "b2", "c1", "c2"]
@testset "names for Type, predicate + standard tests of cols" begin
df_long = DataFrame(a1 = 1:3, a2 = [1, missing, 3],
b1 = 1.0:3.0, b2 = [1.0, missing, 3.0],
c1 = '1':'3', c2 = ['1', missing, '3'], x=1:3)
for x in (df_long[:, Not(end)], @view(df_long[:, Not(end)]),
groupby(df_long[:, Not(end)], :a1), groupby(@view(df_long[:, Not(end)]), :a1),
eachrow(df_long[:, Not(end)]), eachrow(@view(df_long[:, Not(end)])),
eachcol(df_long[:, Not(end)]), eachcol(@view(df_long[:, Not(end)])),
df_long[1, Not(end)])
@test names(x, 1) == ["a1"]
@test names(x, "a1") == ["a1"]
@test names(x, :a1) == ["a1"]
bkamins marked this conversation as resolved.
Show resolved Hide resolved
@test names(x, [2, 1]) == ["a2", "a1"]
@test names(x, ["a2", "a1"]) == ["a2", "a1"]
@test names(x, [:a2, :a1]) == ["a2", "a1"]
@test names(x, Int) == ["a1"]
@test names(x, Union{Missing, Int}) == ["a1", "a2"]
bkamins marked this conversation as resolved.
Show resolved Hide resolved
@test names(x, Real) == ["a1", "b1"]
@test names(x, Union{Missing, Real}) == ["a1", "a2", "b1", "b2"]
@test names(x, Any) == names(x)
@test isempty(names(x, BigInt))
@test names(x, Union{Char, Float64, Missing}) == ["b1", "b2", "c1", "c2"]
@test names(x, startswith("a")) == ["a1", "a2"]
@test names(x, :) == names(x)
@test names(x, <("a2")) == ["a1"]

@test_throws TypeError names(x, x -> 1)
end
end

end # module