diff --git a/NEWS.md b/NEWS.md index 14b1f1fd1..1d7fd3050 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,10 +16,35 @@ ## Bug fixes +* Correctly throw an error if a negative number of rows is passed + to `first` or `last` + ([#3402](https://github.com/JuliaData/DataFrames.jl/pull/3402)) * Always use the default thread pool for multithreaded operations, instead of using the interactive thread pool when Julia was started with `-tM,N` with N > 0 ([#3385](https://github.com/JuliaData/DataFrames.jl/pull/3385)) +* Correctly return `Bool[]` in the `nonunique` function applied to a data frame + with a pooled column that has zero levels in the pool + ([#3393](https://github.com/JuliaData/DataFrames.jl/pull/3393)) +* Correctly index `eachrow` and `eachcol` with `CartesianIndex` + ([#3413](https://github.com/JuliaData/DataFrames.jl/issues/3413)) +* Correctly handle non-standard integers when converting them to `BigInt` + ([#3419](https://github.com/JuliaData/DataFrames.jl/issues/3419)) + +## Removed deprecations + +* The `by` and `aggregate` functions that were deprecated before 1.0 + release are now removed. + ([#3422](https://github.com/JuliaData/DataFrames.jl/issues/3422)) + +## Julia compatibility change + +* Ensure that `allunique(::AbstractDataFrame, ::Any)` always gets + interpreted as a test for uniqueness of rows in the first positional argument + ([#3434](https://github.com/JuliaData/DataFrames.jl/issues/3434)) +* Make sure that an empty vector of `Any` or of `AbstractVector` is treated as having + no columns when a data frame is being processed with `combine`/`select`/`transform`. 
+ ([#3435](https://github.com/JuliaData/DataFrames.jl/issues/3435)) # DataFrames.jl v1.6.1 Release Notes diff --git a/Project.toml b/Project.toml index df31d9fb6..f76100bdf 100644 --- a/Project.toml +++ b/Project.toml @@ -31,7 +31,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" CategoricalArrays = "0.10.0" Combinatorics = "1.0.2" Compat = "4.2" -DataAPI = "1.15.0" +DataAPI = "1.16.0" DataStructures = "0.18" DataValues = "0.4.13" InlineStrings = "1.3.0" diff --git a/docs/Project.toml b/docs/Project.toml index ebe348a76..f6a9f940e 100755 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -11,4 +11,4 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -Documenter = "0.27" +Documenter = "1" diff --git a/docs/make.jl b/docs/make.jl index fa64782da..a5d5a4f4e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -16,7 +16,8 @@ makedocs( format = Documenter.HTML( canonical = "https://juliadata.github.io/DataFrames.jl/stable/", assets = ["assets/favicon.ico"], - edit_link = "main" + edit_link = "main", + size_threshold_ignore = ["man/basics.md", "lib/functions.md"], ), pages = Any[ "Introduction" => "index.md", @@ -42,11 +43,10 @@ makedocs( hide("Internals" => "lib/internals.md"), ] ], - strict = true ) -# Deploy built documentation from Travis. -# ======================================= +# Deploy built documentation. 
+# =========================== deploydocs( # options diff --git a/docs/src/index.md b/docs/src/index.md index 66ed6f3e5..64a943a06 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -12,12 +12,13 @@ other packages you can check-out the following resources * [Data Wrangling with DataFrames.jl Cheat Sheet](https://www.ahsmart.com/pub/data-wrangling-with-data-frames-jl-cheat-sheet/) * [DataFrames Tutorial using Jupyter Notebooks](https://github.com/bkamins/Julia-DataFrames-Tutorial/) * [Julia Academy DataFrames.jl tutorial](https://github.com/JuliaAcademy/DataFrames) -* [JuliaCon 2019](https://github.com/bkamins/JuliaCon2019-DataFrames-Tutorial), - [JuliaCon 2020](https://github.com/bkamins/JuliaCon2020-DataFrames-Tutorial), - [JuliaCon 2021](https://github.com/bkamins/JuliaCon2021-DataFrames-Tutorial), +* [JuliaCon 2023](https://github.com/bkamins/JuliaCon2023-Tutorial), [JuliaCon 2022](https://github.com/bkamins/JuliaCon2022-DataFrames-Tutorial), - [PyData Global 2020](https://github.com/bkamins/PyDataGlobal2020), - and [ODSC Europe 2021](https://github.com/bkamins/ODSC-EUROPE-2021) tutorials + [JuliaCon 2021](https://github.com/bkamins/JuliaCon2021-DataFrames-Tutorial), + [JuliaCon 2020](https://github.com/bkamins/JuliaCon2020-DataFrames-Tutorial), + [JuliaCon 2019](https://github.com/bkamins/JuliaCon2019-DataFrames-Tutorial), + [ODSC Europe 2021](https://github.com/bkamins/ODSC-EUROPE-2021) tutorials, + and [PyData Global 2020](https://github.com/bkamins/PyDataGlobal2020) * [DataFrames.jl showcase](https://github.com/bkamins/DataFrames-Showcase) If you prefer to learn DataFrames.jl from a book you can consider reading: diff --git a/docs/src/man/basics.md b/docs/src/man/basics.md index 6f2427c56..d51038c2d 100644 --- a/docs/src/man/basics.md +++ b/docs/src/man/basics.md @@ -175,6 +175,40 @@ julia> DataFrame([(a=1, b=0), (a=2, b=0)]) 2 │ 2 0 ``` +Sometimes your source data might have a heterogeneous set of columns for each observation. 
+Here is an example: + +``` +julia> source = [(type="circle", radius=10), (type="square", side=20)] +2-element Vector{NamedTuple{names, Tuple{String, Int64}} where names}: + (type = "circle", radius = 10) + (type = "square", side = 20) +``` + +If you want to create a data frame from such data containing all columns present in at least +one of the source observations, with a `missing` entry if some column is not present, then +you can use the `Tables.dictcolumntable` function to help you create the desired data frame: + +``` +julia> DataFrame(Tables.dictcolumntable(source)) +2×3 DataFrame + Row │ type radius side + │ String Int64? Int64? +─────┼────────────────────────── + 1 │ circle 10 missing + 2 │ square missing 20 +``` + +The role of `Tables.dictcolumntable` is to make sure that the `DataFrame` constructor gets information +about all columns present in the source data and properly instantiates them. If we did not use +this function, the `DataFrame` constructor would assume that the first row of data contains the set +of columns present in the source, which would lead to an error in our example: + +``` +julia> DataFrame(source) +ERROR: type NamedTuple has no field radius +``` + Let us finish our review of constructors by showing how to create a `DataFrame` from a matrix. In this case you pass a matrix as a first argument. If the second argument is just `:auto` then column names `x1`, `x2`, ... will be auto generated. 
diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md index 7e1194f56..b13b6b1ef 100644 --- a/docs/src/man/getting_started.md +++ b/docs/src/man/getting_started.md @@ -443,7 +443,7 @@ A particular common case of a collection that supports the a vector of `NamedTuple`s: ```jldoctest dataframe julia> v = [(a=1, b=2), (a=3, b=4)] -2-element Vector{NamedTuple{(:a, :b), Tuple{Int64, Int64}}}: +2-element Vector{@NamedTuple{a::Int64, b::Int64}}: (a = 1, b = 2) (a = 3, b = 4) @@ -460,7 +460,7 @@ You can also easily convert a data frame back to a vector of `NamedTuple`s: julia> using Tables julia> Tables.rowtable(df) -2-element Vector{NamedTuple{(:a, :b), Tuple{Int64, Int64}}}: +2-element Vector{@NamedTuple{a::Int64, b::Int64}}: (a = 1, b = 2) (a = 3, b = 4) ``` diff --git a/docs/src/man/querying_frameworks.md b/docs/src/man/querying_frameworks.md index 47799c5d5..abda7ec6f 100644 --- a/docs/src/man/querying_frameworks.md +++ b/docs/src/man/querying_frameworks.md @@ -5,6 +5,9 @@ DataFramesMeta.jl, DataFrameMacros.jl and Query.jl. They implement a functionali [dplyr](https://dplyr.tidyverse.org/) or [LINQ](https://en.wikipedia.org/wiki/Language_Integrated_Query). +These frameworks are designed both to make it easier for new users to start working with data frames in Julia +and to allow advanced users to write more compact code. + ## DataFramesMeta.jl The [DataFramesMeta.jl](https://github.com/JuliaStats/DataFramesMeta.jl) package @@ -30,7 +33,7 @@ pipe the output of one transformation as an input to another, as with Below we present several selected examples of usage of the package. 
First we subset rows of the source data frame using a logical condition -and select its two columns, renaming one of them: +and select two of its columns, renaming one of them: ```jldoctest dataframesmeta julia> using DataFramesMeta diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 57809cbdb..debd309f5 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -20,6 +20,7 @@ import DataAPI, DataAPI.Between, DataAPI.Cols, DataAPI.describe, + DataAPI.groupby, DataAPI.innerjoin, DataAPI.outerjoin, DataAPI.rightjoin, diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index a812365ee..e8f4e32ed 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -558,14 +558,19 @@ Base.first(df::AbstractDataFrame) = df[1, :] first(df::AbstractDataFrame, n::Integer; view::Bool=false) Get a data frame with the `n` first rows of `df`. +Get all rows if `n` is greater than the number of rows in `df`. +Error if `n` is negative. If `view=false` a freshly allocated `DataFrame` is returned. If `view=true` then a `SubDataFrame` view into `df` is returned. $METADATA_FIXED """ -@inline Base.first(df::AbstractDataFrame, n::Integer; view::Bool=false) = - view ? Base.view(df, 1:min(n ,nrow(df)), :) : df[1:min(n, nrow(df)), :] +@inline function Base.first(df::AbstractDataFrame, n::Integer; view::Bool=false) + n < 0 && throw(ArgumentError("Number of elements must be nonnegative")) + r = min(n, nrow(df)) + return view ? Base.view(df, 1:r, :) : df[1:r, :] +end """ last(df::AbstractDataFrame) @@ -580,14 +585,19 @@ Base.last(df::AbstractDataFrame) = df[nrow(df), :] last(df::AbstractDataFrame, n::Integer; view::Bool=false) Get a data frame with the `n` last rows of `df`. +Get all rows if `n` is greater than the number of rows in `df`. +Error if `n` is negative. If `view=false` a freshly allocated `DataFrame` is returned. If `view=true` then a `SubDataFrame` view into `df` is returned. 
$METADATA_FIXED """ -@inline Base.last(df::AbstractDataFrame, n::Integer; view::Bool=false) = - view ? Base.view(df, max(1, nrow(df)-n+1):nrow(df), :) : df[max(1, nrow(df)-n+1):nrow(df), :] +@inline function Base.last(df::AbstractDataFrame, n::Integer; view::Bool=false) + n < 0 && throw(ArgumentError("Number of elements must be nonnegative")) + r = max(1, nrow(df) - n + 1) + return view ? Base.view(df, r:nrow(df), :) : df[r:nrow(df), :] +end """ describe(df::AbstractDataFrame; cols=:) @@ -1476,7 +1486,7 @@ function fillcombinations(df::AbstractDataFrame, indexcols; end # make sure we do not overflow in the target data frame size - target_rows = Int(prod(x -> big(length(x)), uniquevals)) + target_rows = Int(prod(x -> BigInt(length(x)), uniquevals)) if iszero(target_rows) @assert iszero(nrow(df)) cdf = copy(df) diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index c81228fb1..22589d3c8 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -57,7 +57,7 @@ julia> eachrow(df) 4 │ 4 14 julia> copy.(eachrow(df)) -4-element Vector{NamedTuple{(:x, :y), Tuple{Int64, Int64}}}: +4-element Vector{@NamedTuple{x::Int64, y::Int64}}: (x = 1, y = 11) (x = 2, y = 12) (x = 3, y = 13) @@ -81,6 +81,7 @@ Base.IndexStyle(::Type{<:DataFrameRows}) = Base.IndexLinear() Base.size(itr::DataFrameRows) = (size(parent(itr), 1), ) Base.@propagate_inbounds Base.getindex(itr::DataFrameRows, i::Int) = parent(itr)[i, :] +Base.@propagate_inbounds Base.getindex(itr::DataFrameRows, i::CartesianIndex{1}) = itr[i[1]] Base.@propagate_inbounds Base.getindex(itr::DataFrameRows, idx) = eachrow(@view parent(itr)[idx isa AbstractVector && !(eltype(idx) <: Bool) ? copy(idx) : idx, :]) @@ -263,6 +264,8 @@ Base.iterate(itr::DataFrameColumns, i::Integer=1) = i <= length(itr) ? 
(itr[i], i + 1) : nothing Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::ColumnIndex) = parent(itr)[!, idx] +Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::CartesianIndex{1}) = + itr[idx[1]] Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::MultiColumnIndex) = eachcol(parent(itr)[!, idx]) Base.:(==)(itr1::DataFrameColumns, itr2::DataFrameColumns) = diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index a90f1203d..81dae2da9 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -822,7 +822,12 @@ function select_transform!((nc,)::Ref{Any}, df::AbstractDataFrame, newdf::DataFr res = newres elseif !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix, Tables.AbstractRow}) - res = Tables.columntable(res) + if res isa Union{AbstractVector{Any}, AbstractVector{<:AbstractVector}} + @assert isempty(res) + res = DataFrame() + else + res = Tables.columntable(res) + end end end diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index fd94caac6..9e91d6a53 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -87,7 +87,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) if !(keep in (:first, :last, :noduplicates)) throw(ArgumentError("`keep` must be :first, :last, or :noduplicates")) end - ncol(df) == 0 && return Bool[] + nrow(df) == 0 && return Bool[] res = fill(true, nrow(df)) cols = ntuple(i -> df[!, i], ncol(df)) if keep == :first @@ -207,6 +207,11 @@ function Base.allunique(df::AbstractDataFrame, cols=:) Val(false), nothing, false, nothing, true)[1] == nrow(df) end +# avoid invoking Base.allunique(f, iterator) introduced in Julia 1.11 + +Base.allunique(df::AbstractDataFrame, cols::Tuple) = + invoke(Base.allunique, Tuple{AbstractDataFrame, Any}, df, cols) + """ unique(df::AbstractDataFrame; view::Bool=false, keep::Symbol=:first) 
unique(df::AbstractDataFrame, cols; view::Bool=false, keep::Symbol=:first) diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 3f4afafec..b9a496ba9 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -1546,7 +1546,7 @@ function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...) @assert length(colvalues) == length(colnames) @assert all(x -> x isa AbstractVector, colvalues) - target_rows = Int(prod(x -> big(length(x)), colvalues)) + target_rows = Int(prod(x -> BigInt(length(x)), colvalues)) out_df = DataFrame() inner = 1 for (val, cname) in zip(colvalues, colnames) @@ -1563,4 +1563,3 @@ function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...) end _try_select_no_copy(df::DataFrame, cols) = select(df, cols, copycols=false) - diff --git a/src/deprecated.jl b/src/deprecated.jl index 19839c54a..c93bc70d5 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -1,11 +1,3 @@ -export by, aggregate - -# TODO: remove definitions in 2.0 release -by(args...; kwargs...) = throw(ArgumentError("by function was removed from DataFrames.jl. " * - "Use the `combine(groupby(...), ...)` or `combine(f, groupby(...))` instead.")) -aggregate(args...; kwargs...) = throw(ArgumentError("aggregate function was removed from DataFrames.jl. " * - "Use the `combine` function instead.")) - # TODO: remove deprecation in 2.0 release import Base.delete! @deprecate delete!(df::DataFrame, inds) deleteat!(df::DataFrame, inds) \ No newline at end of file diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index 1a58b7ec4..c73e0730c 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -156,7 +156,7 @@ function refpool_and_array(x::AbstractArray) else minval, maxval = extrema(x) end - ngroups = big(maxval) - big(minval) + 1 + ngroups = BigInt(maxval) - BigInt(minval) + 1 # Threshold chosen with the same rationale as the row_group_slots! 
refpool method: # refpool approach is faster but we should not allocate too much memory either # We also have to avoid overflow, including with ngroups + 1 for missing values @@ -337,7 +337,11 @@ function row_group_slots!(cols::NTuple{N, AbstractVector}, nt = max(1, lg ÷ 100_000) end # if there are few rows per group limit the number of threads used - nt = clamp(round(Int, (lg / 4) / ngroups - 2), 1, nt) + if ngroups == 0 + nt = 1 + else + nt = clamp(round(Int, (lg / 4) / ngroups - 2), 1, nt) + end seen = fill(false, ngroups) seen_vec = Vector{Vector{Bool}}(undef, nt) diff --git a/src/join/core.jl b/src/join/core.jl index 87d94d8fe..89f89c989 100644 --- a/src/join/core.jl +++ b/src/join/core.jl @@ -328,7 +328,7 @@ function _innerjoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}} right::AbstractVector{<:Union{Integer, Missing}}) minv, maxv = extrema_missing(right) - val_range = big(maxv) - big(minv) + val_range = BigInt(maxv) - BigInt(minv) if val_range > typemax(Int) - 3 || val_range ÷ 2 > max(64, length(right)) || minv < typemin(Int) + 2 || maxv > typemax(Int) - 3 return _innerjoin_unsorted(left, right) @@ -648,7 +648,7 @@ function _semijoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}}, right_shorter::Bool) minv, maxv = extrema_missing(right) - val_range = big(maxv) - big(minv) + val_range = BigInt(maxv) - BigInt(minv) if val_range > typemax(Int) - 3 || val_range ÷ 2 > max(64, length(right)) || minv < typemin(Int) + 2 || maxv > typemax(Int) - 3 return _semijoin_unsorted(left, right, seen_rows, right_shorter) diff --git a/test/dataframe.jl b/test/dataframe.jl index fbc2ec0ca..940590852 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -1180,10 +1180,16 @@ end @test_throws BoundsError first(DataFrame(x=[])) @test_throws BoundsError last(DataFrame(x=[])) - @test first(df, 6) == DataFrame(A=1:6) - @test first(df, 1) == DataFrame(A=1) - @test last(df, 6) == DataFrame(A=5:10) - @test last(df, 1) == DataFrame(A=10) + for v in (true, 
false) + @test first(df, 6, view=v) == DataFrame(A=1:6) + @test first(df, 1, view=v) == DataFrame(A=1) + @test first(df, 0, view=v) == DataFrame(A=Int[]) + @test_throws ArgumentError first(df, -1, view=v) + @test last(df, 6, view=v) == DataFrame(A=5:10) + @test last(df, 1, view=v) == DataFrame(A=10) + @test last(df, 0, view=v) == DataFrame(A=Int[]) + @test_throws ArgumentError last(df, -1, view=v) + end @inferred first(df, 6) @inferred last(df, 6) @@ -2325,6 +2331,7 @@ end @test allunique(df, []) @test allunique(df, x -> 1:4) @test allunique(df, [:a, :b] => ByRow(string)) + @test_throws ArgumentError allunique(df, ()) end end diff --git a/test/deprecated.jl b/test/deprecated.jl index beaba2770..7a09015b7 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -4,11 +4,6 @@ using Test, DataFrames, CategoricalArrays const ≅ = isequal -@testset "by and aggregate" begin - @test_throws ArgumentError by() - @test_throws ArgumentError aggregate() -end - @testset "indicator in joins" begin name = DataFrame(ID=[1, 2, 3], Name=["John Doe", "Jane Doe", "Joe Blogs"]) job = DataFrame(ID=[1, 2, 4], Job=["Lawyer", "Doctor", "Farmer"]) diff --git a/test/duplicates.jl b/test/duplicates.jl index 61c01874d..d8c264962 100644 --- a/test/duplicates.jl +++ b/test/duplicates.jl @@ -1,6 +1,6 @@ module TestDuplicates -using Test, DataFrames, CategoricalArrays, Random +using Test, DataFrames, CategoricalArrays, Random, PooledArrays const ≅ = isequal @testset "nonunique" begin @@ -30,8 +30,8 @@ const ≅ = isequal @test_throws ArgumentError unique!(df) @test_throws ArgumentError unique(df, true) - pdf = view(DataFrame(a=CategoricalArray(["a", "a", missing, missing, "b", missing, "a", missing]), - b=CategoricalArray(["a", "b", missing, missing, "b", "a", "a", "a"])), :, :) + pdf = view(DataFrame(a=CategoricalArray(["a", "a", missing, missing, "b", missing, "a", missing]), + b=CategoricalArray(["a", "b", missing, missing, "b", "a", "a", "a"])), :, :) updf = DataFrame(a=CategoricalArray(["a", 
"a", missing, "b", missing]), b=CategoricalArray(["a", "b", missing, "b", "a"])) @test nonunique(pdf) == [false, false, false, true, false, false, true, true] @@ -39,6 +39,9 @@ const ≅ = isequal @test updf ≅ unique(pdf) @test_throws ArgumentError unique!(pdf) @test_throws ArgumentError unique(pdf, true) + + @test isempty(nonunique(DataFrame(a=PooledArray(Int[])))) + @test typeof(nonunique(DataFrame(a=PooledArray(Int[])))) === Vector{Bool} end @testset "nonunique, nonunique, unique! with extra argument" begin diff --git a/test/grouping.jl b/test/grouping.jl index df9e79bd3..a1283c656 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -4531,4 +4531,9 @@ end end end +@testset "no levels in pooled grouping bug #3393" begin + @test isempty(groupby_checked(DataFrame(x=PooledArray([missing])), :x, skipmissing=true)) + @test isempty(groupby_checked(DataFrame(x=categorical([missing])), :x, skipmissing=true)) +end + end # module diff --git a/test/iteration.jl b/test/iteration.jl index 4c1b9d0d1..249677a02 100644 --- a/test/iteration.jl +++ b/test/iteration.jl @@ -15,6 +15,8 @@ using Test, DataFrames @test sprint(summary, eachrow(df)) == "2-element DataFrameRows" @test Base.IndexStyle(eachrow(df)) == IndexLinear() @test eachrow(df)[1] == DataFrameRow(df, 1, :) + @test eachrow(df)[CartesianIndex(1)] == DataFrameRow(df, 1, :) + @test_throws MethodError eachrow(df)[CartesianIndex(1, 1)] @test collect(eachrow(df)) isa Vector{<:DataFrameRow} @test eltype(eachrow(df)) <: DataFrameRow for row in eachrow(df) @@ -35,6 +37,8 @@ using Test, DataFrames @test_throws ArgumentError size(eachcol(df), 2) @test_throws ArgumentError size(eachcol(df), 0) @test eachcol(df)[1] == df[:, 1] + @test eachcol(df)[CartesianIndex(1)] == df[:, 1] + @test_throws MethodError eachcol(df)[CartesianIndex(1, 1)] @test eachcol(df)[:A] === df[!, :A] @test eachcol(df)[All()] == eachcol(df) @test eachcol(df)[Cols(:)] == eachcol(df) diff --git a/test/join.jl b/test/join.jl index 478cca98d..0453d3b63 100644 --- 
a/test/join.jl +++ b/test/join.jl @@ -84,7 +84,7 @@ anti = left[Bool[ismissing(x) for x in left.Job], [:ID, :Name]] @test_throws ArgumentError crossjoin(df1, df2, renamecols=(x -> "a") => x -> "a") @test crossjoin(df1, df2, renamecols=(x -> "a") => x -> "a", makeunique=true) == rename(cross, [:a, :a_1, :a_2]) - + # Cross joins handle naming collisions @test size(crossjoin(df1, df1, makeunique=true)) == (4, 4) @@ -2176,7 +2176,7 @@ end @test_throws ArgumentError outerjoin(df1, df2, on=:x, order=:x) end -@time @testset "randomized join tests with sort" begin +@testset "randomized join tests with sort" begin Random.seed!(1234) for lenl in 0:20, lenr in 0:20, rep in 1:10 df1 = DataFrame(x=rand(0:lenl, lenl), id1=1:lenl) @@ -2221,7 +2221,7 @@ end @testset "wide joins" begin Random.seed!(1234) # we need many repetitions to make sure we cover all cases - @time for _ in 1:1000, k in 2:4 + for _ in 1:1000, k in 2:4 dfs = [(n=rand(10:20); DataFrame("id" => randperm(n), "x$i" => 1:n)) for i in 1:4] @test issorted(innerjoin(dfs..., on="id", order=:left)[:, 2]) @@ -2232,9 +2232,9 @@ end dfs = [DataFrame("id" => 0, "x$i" => i) for i in 1:10000] res = innerjoin(dfs..., on="id") - @test res == DataFrame(["id" => 0; ["x$i" => i for i in 1:10000]]) + @test res == DataFrame(["id" => 0; ["x$i" => i for i in 1:10000]]) res = outerjoin(dfs..., on="id") - @test res == DataFrame(["id" => 0; ["x$i" => i for i in 1:10000]]) + @test res == DataFrame(["id" => 0; ["x$i" => i for i in 1:10000]]) end end # module diff --git a/test/select.jl b/test/select.jl index 67f97df2f..3a8ad3b23 100644 --- a/test/select.jl +++ b/test/select.jl @@ -3024,4 +3024,22 @@ end @test_throws ArgumentError combine(gdf, :x => (x -> x[1] == 2 ? 
"x" : cr) => AsTable) end +@testset "empty vector" begin + df = DataFrame(a=1:3) + + @test_throws ArgumentError select(df, :a => (x -> Vector{Any}[])) + + for T in (Vector{Any}, Any, NamedTuple{(:x,),Tuple{Int64}}) + v = combine(df, :a => (x -> T[])).a_function + @test isempty(v) + @test eltype(v) === T + end + + @test size(combine(df, :a => (x -> Vector{Any}[]) => AsTable)) == (0, 0) + @test size(combine(df, :a => (x -> Any[]) => AsTable)) == (0, 0) + df2 = combine(df, :a => (x -> NamedTuple{(:x,),Tuple{Int64}}[]) => AsTable) + @test size(df2) == (0, 1) + @test eltype(df2.x) === Int +end + end # module