diff --git a/Project.toml b/Project.toml index af3634f991..e8980d9c3b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "DataFrames" uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "0.22.2" +version = "0.22.3" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" @@ -24,7 +24,6 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] -julia = "1" CategoricalArrays = "0.8.3, 0.9" Compat = "3.17" DataAPI = "1.4" @@ -33,10 +32,12 @@ IteratorInterfaceExtensions = "0.1.1, 1" Missings = "0.4.2" PooledArrays = "0.5" PrettyTables = "0.10" -Reexport = "0.1, 0.2" +Reexport = "0.1, 0.2, 1.0" SortingAlgorithms = "0.1, 0.2, 0.3" TableTraits = "0.4, 1" Tables = "1.2" +Unitful = "1" +julia = "1" [extras] DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" @@ -45,6 +46,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d" [targets] -test = ["DataStructures", "DataValues", "Dates", "Logging", "Random", "Test"] +test = ["DataStructures", "DataValues", "Dates", "Logging", "Random", "Test", "Unitful"] diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl index 4bce0e6c5f..01222553de 100644 --- a/src/groupeddataframe/fastaggregates.jl +++ b/src/groupeddataframe/fastaggregates.jl @@ -238,24 +238,23 @@ end function (agg::Aggregate{typeof(var)})(incol::AbstractVector, gd::GroupedDataFrame) means = groupreduce((x, i) -> x, Base.add_sum, agg.condf, /, false, incol, gd) + z = zero(eltype(incol)) - zero(eltype(means)) + S = typeof((abs2(z) + abs2(z))/2) # !ismissing check is purely an optimization to avoid a copy later - if eltype(means) >: Missing && agg.condf !== !ismissing - T = Union{Missing, real(eltype(means))} - else - T = real(eltype(means)) - end + T = eltype(incol) >: Missing && agg.condf !== !ismissing ? + T = Union{Missing, S} : S res = zeros(T, length(gd)) return groupreduce!(res, (x, i) -> @inbounds(abs2(x - means[i])), +, agg.condf, - (x, l) -> l <= 1 ? oftype(x / (l-1), NaN) : x / (l-1), + (x, l) -> l <= 1 ? x/0 : x/(l-1), false, incol, gd) end function (agg::Aggregate{typeof(std)})(incol::AbstractVector, gd::GroupedDataFrame) outcol = Aggregate(var, agg.condf)(incol, gd) - if eltype(outcol) <: Union{Missing, Rational} - return sqrt.(outcol) - else + if typeof(sqrt(zero(eltype(outcol)))) === eltype(outcol) return map!(sqrt, outcol, outcol) + else + return map(sqrt, outcol) end end diff --git a/test/grouping.jl b/test/grouping.jl index 27d05e7f49..c2f68176cb 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -1,6 +1,7 @@ module TestGrouping -using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays, DataAPI +using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays, DataAPI, + Unitful const ≅ = isequal """Check if passed data frames are `isequal` and have the same element types of columns""" @@ -904,8 +905,11 @@ Base.isless(::TestType, ::TestType) = false @testset "combine with aggregation functions (skipmissing=$skip, sort=$sort, indices=$indices)" for skip in (false, true), sort in (false, true), indices in (false, true) Random.seed!(1) - df = DataFrame(a = rand([1:5;missing], 20), x1 = rand(1:100, 20), - x2 = rand(1:100, 20) +im*rand(1:100, 20)) + # 5 is there to ensure we test a single-row group + df = DataFrame(a = [rand([1:4;missing], 19); 5], + x1 = rand(1:100, 20), + x2 = rand(1:100, 20) + im*rand(1:100, 20), + x3 = rand(1:100, 20) .* u"m") for f in (sum, prod, maximum, minimum, mean, var, std, first, last, length) gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) @@ -918,39 +922,39 @@ Base.isless(::TestType, ::TestType) = false for T in (Union{Missing, Int}, Union{Int, Int8}, Union{Missing, Int, Int8}) - df.x3 = Vector{T}(df.x1) + df.x1u = Vector{T}(df.x1) gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices - res = combine(gd, :x3 => f => :y) - expected = combine(gd, :x3 => (x -> f(x)) => :y) + res = combine(gd, :x1u => f => :y) + expected = combine(gd, :x1u => (x -> f(x)) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) end f === length && continue - df.x3 = allowmissing(df.x1) - df.x3[1] = missing + df.x1m = allowmissing(df.x1) + df.x1m[1] = missing gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices - res = combine(gd, :x3 => f => :y) - expected = combine(gd, :x3 => (x -> f(x)) => :y) + res = combine(gd, :x1m => f => :y) + expected = combine(gd, :x1m => (x -> f(x)) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) - res = combine(gd, :x3 => f∘skipmissing => :y) - expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y) + res = combine(gd, :x1m => f∘skipmissing => :y) + expected = combine(gd, :x1m => (x -> f(collect(skipmissing(x)))) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) # Test reduction over group with only missing values gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices - gd[1][:, :x3] .= missing + gd[1][:, :x1m] .= missing if f in (maximum, minimum, first, last) - @test_throws ArgumentError combine(gd, :x3 => f∘skipmissing => :y) + @test_throws ArgumentError combine(gd, :x1m => f∘skipmissing => :y) else - res = combine(gd, :x3 => f∘skipmissing => :y) - expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y) + res = combine(gd, :x1m => f∘skipmissing => :y) + expected = combine(gd, :x1m => (x -> f(collect(skipmissing(x)))) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) end @@ -964,29 +968,95 @@ Base.isless(::TestType, ::TestType) = false expected = combine(gd, :x2 => (x -> f(x)) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) + + f === length && continue + + df.x2m = allowmissing(df.x1) + df.x2m[1] = missing + gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) + indices && @test gd.idx !== nothing # Trigger computation of indices + res = combine(gd, :x2m => f => :y) + expected = combine(gd, :x2m => (x -> f(x)) => :y) + @test res ≅ expected + @test typeof(res.y) == typeof(expected.y) + res = combine(gd, :x2m => f∘skipmissing => :y) + expected = combine(gd, :x2m => (x -> f(collect(skipmissing(x)))) => :y) + @test res ≅ expected + @test typeof(res.y) == typeof(expected.y) + + # Test reduction over group with only missing values + gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) + indices && @test gd.idx !== nothing # Trigger computation of indices + gd[1][:, :x2m] .= missing + if f in (maximum, minimum, first, last) + @test_throws ArgumentError combine(gd, :x2m => f∘skipmissing => :y) + else + res = combine(gd, :x2m => f∘skipmissing => :y) + expected = combine(gd, :x2m => (x -> f(collect(skipmissing(x)))) => :y) + @test res ≅ expected + @test typeof(res.y) == typeof(expected.y) + end + end + # Test Unitful numbers + for f in (sum, mean, minimum, maximum, var, std, first, last, length) + gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) + indices && @test gd.idx !== nothing # Trigger computation of indices + + res = combine(gd, :x3 => f => :y) + expected = combine(gd, :x3 => (x -> f(x)) => :y) + @test res ≅ expected + @test typeof(res.y) == typeof(expected.y) + + f === length && continue + + df.x3m = allowmissing(df.x1) + df.x3m[1] = missing + gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) + indices && @test gd.idx !== nothing # Trigger computation of indices + res = combine(gd, :x3m => f => :y) + expected = combine(gd, :x3m => (x -> f(x)) => :y) + @test res ≅ expected + @test typeof(res.y) == typeof(expected.y) + res = combine(gd, :x3m => f∘skipmissing => :y) + expected = combine(gd, :x3m => (x -> f(collect(skipmissing(x)))) => :y) + @test res ≅ expected + @test typeof(res.y) == typeof(expected.y) + + # Test reduction over group with only missing values + gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) + indices && @test gd.idx !== nothing # Trigger computation of indices + gd[1][:, :x3m] .= missing + if f in (maximum, minimum, first, last) + @test_throws ArgumentError combine(gd, :x3m => f∘skipmissing => :y) + else + res = combine(gd, :x3m => f∘skipmissing => :y) + expected = combine(gd, :x3m => (x -> f(collect(skipmissing(x)))) => :y) + @test res ≅ expected + @test typeof(res.y) == typeof(expected.y) + end end # Test CategoricalArray for f in (maximum, minimum, first, last, length), (T, m) in ((Int, false), (Union{Missing, Int}, false), (Union{Missing, Int}, true)) - df.x3 = CategoricalVector{T}(df.x1) - m && (df.x3[1] = missing) + df.x1c = CategoricalVector{T}(df.x1) + m && (df.x1c[1] = missing) gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices - res = combine(gd, :x3 => f => :y) - expected = combine(gd, :x3 => (x -> f(x)) => :y) + res = combine(gd, :x1c => f => :y) + expected = combine(gd, :x1c => (x -> f(x)) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) f === length && continue - res = combine(gd, :x3 => f∘skipmissing => :y) - expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y) + res = combine(gd, :x1c => f∘skipmissing => :y) + expected = combine(gd, :x1c => (x -> f(collect(skipmissing(x)))) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) if m - gd[1][:, :x3] .= missing - @test_throws ArgumentError combine(gd, :x3 => f∘skipmissing => :y) + gd[1][:, :x1c] .= missing + @test_throws ArgumentError combine(gd, :x1c => f∘skipmissing => :y) end end @test combine(gd, :x1 => maximum => :y, :x2 => sum => :z) ≅