Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make DataFrame constructor more flexible #2859

Merged
merged 5 commits into from
Sep 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@
* `leftjoin!` performing a left join of two data frame objects by updating the
left data frame with the joined columns from right data frame.
([#2843](https://github.com/JuliaData/DataFrames.jl/pull/2843))
* the `DataFrame` constructor when column names are passed to it as a second
argument now determines if a passed vector of column names is valid based on
its contents and not element type
([#2829](https://github.com/JuliaData/DataFrames.jl/pull/2859))
* the `DataFrame` constructor when matrix is passed to it as a first
argument now allows `copycols` keyword argument
([#2829](https://github.com/JuliaData/DataFrames.jl/pull/2859))

## Bug fixes

Expand Down
41 changes: 26 additions & 15 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ DataFrame(ds::AbstractDict; copycols::Bool=true)
DataFrame(kwargs..., copycols::Bool=true)

DataFrame(columns::AbstractVecOrMat,
names::AbstractVector{<:Union{AbstractVector, Symbol}};
names::AbstractVector;
makeunique::Bool=false, copycols::Bool=true)

DataFrame(table; copycols::Union{Bool, Nothing}=nothing)
Expand Down Expand Up @@ -56,7 +56,9 @@ the appropriate length. As a particular rule values stored in a `Ref` or a
It is also allowed to pass a vector of vectors or a matrix as as the first
argument. In this case the second argument must be
a vector of `Symbol`s or strings specifying column names, or the symbol `:auto`
to generate column names `x1`, `x2`, ... automatically.
to generate column names `x1`, `x2`, ... automatically. Note that in this case
if the first argument is a matrix and `copycols=false` the columns of the created
`DataFrame` will be views of columns the source matrix.

If a single positional argument is passed to a `DataFrame` constructor then it
is assumed to be of type that implements the
Expand Down Expand Up @@ -322,19 +324,26 @@ function DataFrame(columns::AbstractVector, cnames::AbstractVector{Symbol};
copycols=copycols)
end

DataFrame(columns::AbstractVector, cnames::AbstractVector{<:AbstractString};
function _name2symbol(str::AbstractVector)
if !(all(x -> x isa AbstractString, str) || all(x -> x isa Symbol, str))
throw(ArgumentError("All passed column names must be strings or Symbols"))
end
return Symbol[Symbol(s) for s in str]
end

DataFrame(columns::AbstractVector, cnames::AbstractVector;
makeunique::Bool=false, copycols::Bool=true) =
DataFrame(columns, Symbol.(cnames), makeunique=makeunique, copycols=copycols)
DataFrame(columns, _name2symbol(cnames), makeunique=makeunique, copycols=copycols)

DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector{Symbol};
makeunique::Bool=false, copycols::Bool=true)::DataFrame =
DataFrame(collect(AbstractVector, columns),
Index(convert(Vector{Symbol}, cnames), makeunique=makeunique),
copycols=copycols)

DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector{<:AbstractString};
DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector;
makeunique::Bool=false, copycols::Bool=true) =
DataFrame(columns, Symbol.(cnames); makeunique=makeunique, copycols=copycols)
DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, copycols=copycols)

function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true)
if cnames !== :auto
Expand All @@ -346,22 +355,25 @@ function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true)
return DataFrame(columns, gennames(length(columns)), copycols=copycols)
end

DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol}; makeunique::Bool=false) =
DataFrame(AbstractVector[columns[:, i] for i in 1:size(columns, 2)], cnames,
makeunique=makeunique, copycols=false)
function DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol};
makeunique::Bool=false, copycols::Bool=true)
getter = copycols ? getindex : view
return DataFrame(AbstractVector[getter(columns, :, i) for i in 1:size(columns, 2)],
cnames, makeunique=makeunique, copycols=false)
end

DataFrame(columns::AbstractMatrix, cnames::AbstractVector{<:AbstractString};
makeunique::Bool=false) =
DataFrame(columns, Symbol.(cnames); makeunique=makeunique)
DataFrame(columns::AbstractMatrix, cnames::AbstractVector;
makeunique::Bool=false, copycols::Bool=true) =
DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, copycols=copycols)

function DataFrame(columns::AbstractMatrix, cnames::Symbol)
function DataFrame(columns::AbstractMatrix, cnames::Symbol; copycols::Bool=true)
if cnames !== :auto
throw(ArgumentError("if the first positional argument to DataFrame " *
"constructor is a matrix and a second " *
"positional argument is passed then the second " *
"argument must be a vector of column names or :auto"))
end
return DataFrame(columns, gennames(size(columns, 2)), makeunique=false)
return DataFrame(columns, gennames(size(columns, 2)), makeunique=false, copycols=copycols)
end

# Discontinued constructors
Expand All @@ -388,7 +400,6 @@ DataFrame(column_eltypes::AbstractVector{<:Type}, cnames::AbstractVector{<:Abstr
"not supported. Pass explicitly created columns to a " *
"`DataFrame` constructor instead."))


##############################################################################
##
## AbstractDataFrame interface
Expand Down
42 changes: 40 additions & 2 deletions test/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -315,8 +315,6 @@ end
@test_throws ArgumentError DataFrame([1:3, 1], ["x1", "x2"], copycols=copycolsarg)
@test_throws ErrorException DataFrame([1:3, 1], copycols=copycolsarg)
end

@test_throws MethodError DataFrame([1 2; 3 4], :auto, copycols=false)
end

@testset "column types" begin
Expand Down Expand Up @@ -363,6 +361,8 @@ end
@test_throws ArgumentError DataFrame([Int, Float64], [:a, :b], 2)
@test_throws ArgumentError DataFrame([Int, Float64], ["a", "b"])
@test_throws ArgumentError DataFrame([Int, Float64], ["a", "b"], 2)
@test_throws ArgumentError DataFrame([Int, Float64], ["a", :b])
@test_throws MethodError DataFrame([Int, Float64], ["a", :b], 2)
end

@testset "threading correctness tests" begin
Expand All @@ -372,4 +372,42 @@ end
end
end

@testset "non-specific vector of column names" begin
ref = DataFrame(a=1:2, b=3:4)
for x in ([1 3; 2 4], [[1, 2], [3, 4]], [1:2, 3:4], Any[[1, 2], [3, 4]], Any[1:2, 3:4])
@test DataFrame(x, Any[:a, :b]) == ref
@test DataFrame(x, Any["a", "b"]) == ref
@test DataFrame(x, Union{String, Symbol}[:a, :b]) == ref
@test DataFrame(x, Union{String, Symbol}["a", "b"]) == ref
@test_throws ArgumentError DataFrame(x, Any["a", :b])
@test_throws ArgumentError DataFrame(x, Union{String, Symbol}["a", :b])
end
@test DataFrame([], []) == DataFrame()
@test DataFrame(fill(0, 0, 0), []) == DataFrame()
@test_throws ArgumentError DataFrame(Type[], Symbol[])
@test_throws ArgumentError DataFrame(Type[], String[])
@test_throws MethodError DataFrame(Type[], [])
end

@testset "DataFrame matrix constructor copycols kwarg" begin
m = [1 4; 2 5; 3 6]
refdf = DataFrame(x1=1:3, x2=4:6)
for cnames in ([:x1, :x2], ["x1", "x2"], Any[:x1, :x2], Any["x1", "x2"], :auto)
df = DataFrame(m, cnames)
@test df == refdf
@test df.x1 isa Vector{Int}
@test df.x2 isa Vector{Int}
df = DataFrame(m, cnames, copycols=true)
@test df == refdf
@test df.x1 isa Vector{Int}
@test df.x2 isa Vector{Int}
df = DataFrame(m, cnames, copycols=false)
@test df == refdf
@test df.x1 isa SubArray{Int, 1, Matrix{Int}}
@test df.x2 isa SubArray{Int, 1, Matrix{Int}}
@test parent(df.x1) === m
@test parent(df.x2) === m
end
end

end # module