From b81170cfa4922837ad6f82a659e55541c7c4b02b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 5 Sep 2021 23:57:09 +0200 Subject: [PATCH 1/4] make DataFrame constructor more flexible --- NEWS.md | 4 ++++ src/dataframe/dataframe.jl | 20 +++++++++++++------- test/constructors.jl | 19 +++++++++++++++++++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 318e64dc99..e1c5b9cee7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -54,6 +54,10 @@ (with `false` default) that specifies if columns should be inserted after or before `col`. ([#2829](https://github.com/JuliaData/DataFrames.jl/pull/2829)) +* the `DataFrame` constructor when column names are passed to it as a second + argument now determines if a passed vector of column names is valid based on + its contents and not element type + ([#2829](https://github.com/JuliaData/DataFrames.jl/pull/2859)) ## Bug fixes diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index e5a3360c2e..46e6987a00 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -322,9 +322,16 @@ function DataFrame(columns::AbstractVector, cnames::AbstractVector{Symbol}; copycols=copycols) end -DataFrame(columns::AbstractVector, cnames::AbstractVector{<:AbstractString}; +function _name2symbol(str::AbstractVector) + if !(all(x -> x isa AbstractString, str) || all(x -> x isa Symbol, str)) + throw(ArgumentError("All passed column names must be strings or Symbols")) + end + return Symbol[Symbol(s) for s in str] +end + +DataFrame(columns::AbstractVector, cnames::AbstractVector; makeunique::Bool=false, copycols::Bool=true) = - DataFrame(columns, Symbol.(cnames), makeunique=makeunique, copycols=copycols) + DataFrame(columns, _name2symbol(cnames), makeunique=makeunique, copycols=copycols) DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector{Symbol}; makeunique::Bool=false, copycols::Bool=true)::DataFrame = @@ -332,9 +339,9 @@ DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector{Symb Index(convert(Vector{Symbol}, cnames), makeunique=makeunique), copycols=copycols) -DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector{<:AbstractString}; +DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector; makeunique::Bool=false, copycols::Bool=true) = - DataFrame(columns, Symbol.(cnames); makeunique=makeunique, copycols=copycols) + DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, copycols=copycols) function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true) if cnames !== :auto @@ -350,9 +357,9 @@ DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol}; makeunique::B DataFrame(AbstractVector[columns[:, i] for i in 1:size(columns, 2)], cnames, makeunique=makeunique, copycols=false) -DataFrame(columns::AbstractMatrix, cnames::AbstractVector{<:AbstractString}; +DataFrame(columns::AbstractMatrix, cnames::AbstractVector; makeunique::Bool=false) = - DataFrame(columns, Symbol.(cnames); makeunique=makeunique) + DataFrame(columns, _name2symbol(cnames); makeunique=makeunique) function DataFrame(columns::AbstractMatrix, cnames::Symbol) if cnames !== :auto @@ -388,7 +395,6 @@ DataFrame(column_eltypes::AbstractVector{<:Type}, cnames::AbstractVector{<:Abstr "not supported. Pass explicitly created columns to a " * "`DataFrame` constructor instead.")) - ############################################################################## ## ## AbstractDataFrame interface diff --git a/test/constructors.jl b/test/constructors.jl index 9a4b575eb5..b4b0a52406 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -363,6 +363,8 @@ end @test_throws ArgumentError DataFrame([Int, Float64], [:a, :b], 2) @test_throws ArgumentError DataFrame([Int, Float64], ["a", "b"]) @test_throws ArgumentError DataFrame([Int, Float64], ["a", "b"], 2) + @test_throws ArgumentError DataFrame([Int, Float64], ["a", :b]) + @test_throws MethodError DataFrame([Int, Float64], ["a", :b], 2) end @testset "threading correctness tests" begin @@ -372,4 +374,21 @@ end end end +@testset "non-specific vector of column names" begin + ref = DataFrame(a=1:2, b=3:4) + for x in ([1 3; 2 4], [[1, 2], [3, 4]], [1:2, 3:4], Any[[1, 2], [3, 4]], Any[1:2, 3:4]) + @test DataFrame(x, Any[:a, :b]) == ref + @test DataFrame(x, Any["a", "b"]) == ref + @test DataFrame(x, Union{String, Symbol}[:a, :b]) == ref + @test DataFrame(x, Union{String, Symbol}["a", "b"]) == ref + @test_throws ArgumentError DataFrame(x, Any["a", :b]) + @test_throws ArgumentError DataFrame(x, Union{String, Symbol}["a", :b]) + end + @test DataFrame([], []) == DataFrame() + @test DataFrame(fill(0, 0, 0), []) == DataFrame() + @test_throws ArgumentError DataFrame(Type[], Symbol[]) + @test_throws ArgumentError DataFrame(Type[], String[]) + @test_throws MethodError DataFrame(Type[], []) +end + end # module From 93b49e8ccfa98e782437b94367ecd760779c52ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 6 Sep 2021 18:17:40 +0200 Subject: [PATCH 2/4] allow copycols in matrix constructor --- NEWS.md | 3 +++ src/dataframe/dataframe.jl | 23 ++++++++++++++--------- test/constructors.jl | 21 +++++++++++++++++++++ 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index e1c5b9cee7..b5e2076854 100644 --- a/NEWS.md +++ b/NEWS.md @@ -58,6 +58,9 @@ argument now determines if a passed vector of column names is valid based on its contents and not element type ([#2829](https://github.com/JuliaData/DataFrames.jl/pull/2859)) +* the `DataFrame` constructor when matrix is passed to it as a first + argument now allows `copycols` keyword argument + ([#2829](https://github.com/JuliaData/DataFrames.jl/pull/2859)) ## Bug fixes diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 46e6987a00..5f9fb83ffa 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -14,7 +14,7 @@ DataFrame(ds::AbstractDict; copycols::Bool=true) DataFrame(kwargs..., copycols::Bool=true) DataFrame(columns::AbstractVecOrMat, - names::AbstractVector{<:Union{AbstractVector, Symbol}}; + names::AbstractVector; makeunique::Bool=false, copycols::Bool=true) DataFrame(table; copycols::Union{Bool, Nothing}=nothing) @@ -56,7 +56,9 @@ the appropriate length. As a particular rule values stored in a `Ref` or a It is also allowed to pass a vector of vectors or a matrix as as the first argument. In this case the second argument must be a vector of `Symbol`s or strings specifying column names, or the symbol `:auto` -to generate column names `x1`, `x2`, ... automatically. +to generate column names `x1`, `x2`, ... automatically. Note that in this case +if the first argument is a matrix and `copycols=false` the columns of the created +`DataFrame` will be views of columns the source matrix. If a single positional argument is passed to a `DataFrame` constructor then it is assumed to be of type that implements the @@ -353,22 +355,25 @@ function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true) return DataFrame(columns, gennames(length(columns)), copycols=copycols) end -DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol}; makeunique::Bool=false) = - DataFrame(AbstractVector[columns[:, i] for i in 1:size(columns, 2)], cnames, - makeunique=makeunique, copycols=false) +function DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol}; + makeunique::Bool=false, copycols::Bool=true) + getter = copycols ? getindex : view + return DataFrame(AbstractVector[getter(columns, :, i) for i in 1:size(columns, 2)], + cnames, makeunique=makeunique, copycols=false) +end DataFrame(columns::AbstractMatrix, cnames::AbstractVector; - makeunique::Bool=false) = - DataFrame(columns, _name2symbol(cnames); makeunique=makeunique) + makeunique::Bool=false, copycols::Bool=true) = + DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, copycols=copycols) -function DataFrame(columns::AbstractMatrix, cnames::Symbol) +function DataFrame(columns::AbstractMatrix, cnames::Symbol; copycols::Bool=true) if cnames !== :auto throw(ArgumentError("if the first positional argument to DataFrame " * "constructor is a matrix and a second " * "positional argument is passed then the second " * "argument must be a vector of column names or :auto")) end - return DataFrame(columns, gennames(size(columns, 2)), makeunique=false) + return DataFrame(columns, gennames(size(columns, 2)), makeunique=false, copycols=copycols) end # Discontinued constructors diff --git a/test/constructors.jl b/test/constructors.jl index b4b0a52406..fb4e2278fa 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -391,4 +391,25 @@ end @test_throws MethodError DataFrame(Type[], []) end +@testset "DataFrame matrix constructor copycols kwarg" begin + m = [1 4; 2 5; 3 6] + refdf = DataFrame(x1=1:3, x2=4:6) + for cnames in ([:x1, :x2], ["x1", "x2"], Any[:x1, :x2], Any["x1", "x2"], :auto) + df = DataFrame(m, cnames) + @test df == refdf + @test df.x1 isa Vector{Int} + @test df.x2 isa Vector{Int} + df = DataFrame(m, cnames, copycols=true) + @test df == refdf + @test df.x1 isa Vector{Int} + @test df.x2 isa Vector{Int} + df = DataFrame(m, cnames, copycols=false) + @test df == refdf + @test df.x1 isa SubArray{Int,1, Matrix{Int}} + @test df.x2 isa SubArray{Int,1, Matrix{Int}} + @test parent(df.x1) === m + @test parent(df.x2) === m + end +end + end # module From 0c47dd7ce379d0f78831d2a109af7cd4cb3bbe33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 6 Sep 2021 19:39:17 +0200 Subject: [PATCH 3/4] fix old test --- test/constructors.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/constructors.jl b/test/constructors.jl index fb4e2278fa..fc32c78e63 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -315,8 +315,6 @@ end @test_throws ArgumentError DataFrame([1:3, 1], ["x1", "x2"], copycols=copycolsarg) @test_throws ErrorException DataFrame([1:3, 1], copycols=copycolsarg) end - - @test_throws MethodError DataFrame([1 2; 3 4], :auto, copycols=false) end @testset "column types" begin From 1ccaefacccf75689d1abdd4be0ac4843a3f275f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 6 Sep 2021 21:25:34 +0200 Subject: [PATCH 4/4] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/dataframe/dataframe.jl | 2 +- test/constructors.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 5f9fb83ffa..2e5f137de0 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -359,7 +359,7 @@ function DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol}; makeunique::Bool=false, copycols::Bool=true) getter = copycols ? getindex : view return DataFrame(AbstractVector[getter(columns, :, i) for i in 1:size(columns, 2)], - cnames, makeunique=makeunique, copycols=false) + cnames, makeunique=makeunique, copycols=false) end DataFrame(columns::AbstractMatrix, cnames::AbstractVector; diff --git a/test/constructors.jl b/test/constructors.jl index fc32c78e63..521c07baea 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -403,8 +403,8 @@ end @test df.x2 isa Vector{Int} df = DataFrame(m, cnames, copycols=false) @test df == refdf - @test df.x1 isa SubArray{Int,1, Matrix{Int}} - @test df.x2 isa SubArray{Int,1, Matrix{Int}} + @test df.x1 isa SubArray{Int, 1, Matrix{Int}} + @test df.x2 isa SubArray{Int, 1, Matrix{Int}} @test parent(df.x1) === m @test parent(df.x2) === m end