require AbstractVector from subset selectors (#2744)

JuliaData · May 4, 2021 · c3083fc · c3083fc · bkamins · May 4, 2021
1 parent 6e35e29
commit c3083fc
Show file tree

Hide file tree

Showing 6 changed files with 331 additions and 240 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,23 @@
-# DataFrames v1.0 Release Notes
+# DataFrames.jl v1.1 Release Notes
+
+## Functionality changes
+
+* make sure `subset` checks if the passed condition function
+  returns a vector of values (in the 1.0 release also returning scalar `true`,
+  `false`, or `missing` was allowed which was unintended and error prone)
+  ([#2744](https://github.com/JuliaData/DataFrames.jl/pull/2744))
+
+
+# DataFrames.jl v1.0.2 Patch Release Notes
+
+## Performance improvements
+
+* fix of performance issue of `groupby` when using multi-threading
+  ([#2736](https://github.com/JuliaData/DataFrames.jl/pull/2736))
+* fix of performance issue of `groupby` when using `PooledVector`
+  ([2733](https://github.com/JuliaData/DataFrames.jl/pull/2733))
+
+# DataFrames.jl v1.0 Release Notes
 
 ## Breaking changes
 
@@ -76,7 +95,7 @@
    [#2574](https://github.com/JuliaData/DataFrames.jl/pull/2574),
    [#2664](https://github.com/JuliaData/DataFrames.jl/pull/2664))
 
-# DataFrames v0.22.7 Release notes
+# DataFrames.jl v0.22.7 Release notes
 
 * `convert` methods from `AbstractDataFrame`, `DataFrameRow` and `GroupKey`
   to `Array`, `Matrix`, `Vector` and `Tuple`, as well as from `AbstractDict` to
@@ -95,7 +114,7 @@
   `Tables.CopiedColumns`
   ([#2656](https://github.com/JuliaData/DataFrames.jl/pull/2656))
 
-# DataFrames v0.22 Release Notes
+# DataFrames.jl v0.22 Release Notes
 
 ## Breaking changes
 

diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "DataFrames"
 uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-version = "1.0.2"
+version = "1.1.0"
 
 [deps]
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"

diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
@@ -31,6 +31,20 @@ function _and_missing(x::Any...)
                         "but only true, false, or missing are allowed"))
 end
 
+# we are guaranteed that ByRow returns a vector
+# this workaround is needed for 0-argument ByRow
+assert_bool_vec(fun::ByRow) = fun
+
+function assert_bool_vec(@nospecialize(fun))
+    return function(x...)
+        val = fun(x...)
+        if !(val isa AbstractVector)
+            throw(ArgumentError("functions passed to `subset` must return an AbstractVector."))
+        end
+        return val
+    end
+end
+
 # Note that _get_subset_conditions will have a large compilation time
 # if more than 32 conditions are passed as `args`.
 function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
@@ -39,7 +53,7 @@ function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
     conditions = Any[if a isa ColumnIndex
                          a => Symbol(:x, i)
                      elseif a isa Pair{<:Any, <:Base.Callable}
-                         first(a) => last(a) => Symbol(:x, i)
+                         first(a) => assert_bool_vec(last(a)) => Symbol(:x, i)
                      else
                          throw(ArgumentError("condition specifier $a is not supported by `subset`"))
                      end for (i, a) in enumerate(args)]
@@ -71,13 +85,16 @@ end
 
 Return a copy of data frame `df` or parent of `gdf` containing only rows for
 which all values produced by transformation(s) `args` for a given row are `true`.
+All transformations must produce vectors containing `true` or `false` (and
+optionally `missing` if `skipmissing=true`).
 
 Each argument passed in `args` can be either a single column selector or a
 `source_columns => function` transformation specifier following the rules
 described for [`select`](@ref).
 
 Note that as opposed to [`filter`](@ref) the `subset` function works on whole
-columns (or all rows in groups for `GroupedDataFrame`).
+columns (and selects rows within groups for `GroupedDataFrame`
+rather than whole groups) and must return a vector.
 
 If `skipmissing=false` (the default) `args` are required to produce vectors
 containing only `Bool` values. If `skipmissing=true`, additionally `missing` is
@@ -174,13 +191,16 @@ end
 
 Update data frame `df` or the parent of `gdf` in place to contain only rows for
 which all values produced by transformation(s) `args` for a given row is `true`.
+All transformations must produce vectors containing `true` or `false` (and
+optionally `missing` if `skipmissing=true`).
 
 Each argument passed in `args` can be either a single column selector or a
 `source_columns => function` transformation specifier following the rules
 described for [`select`](@ref).
 
 Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole
-columns (or all rows in groups for `GroupedDataFrame`).
+columns (and selects rows within groups for `GroupedDataFrame` rather than whole
+groups) and must return a vector.
 
 If `skipmissing=false` (the default) `args` are required to produce vectors
 containing only `Bool` values. If `skipmissing=true`, additionally `missing` is
@@ -191,7 +211,10 @@ If `ungroup=false` the resulting data frame is re-grouped based on the same
 grouping columns as `gdf` and a `GroupedDataFrame` is returned.
 
 If `GroupedDataFrame` is subsetted then it must include all groups present in the
-`parent` data frame, like in [`select!`](@ref).
+`parent` data frame, like in [`select!`](@ref). Also note that in general
+the parent of the passed `GroupedDataFrame` is mutated in place, which means
+that the `GroupedDataFrame` is not safe to use as most likely it will
+get corrupted due to row removal in the parent.
 
 See also: [`subset`](@ref), [`filter!`](@ref), [`select!`](@ref)
 

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -3482,238 +3482,6 @@ end
     @test df == df2
 end
 
-@testset "subset and subset!" begin
-    refdf = DataFrame(x = repeat(Any[true, false], 4),
-                      y = repeat([true, false, missing, missing], 2),
-                      z = repeat([1, 2, 3, 3], 2),
-                      id = 1:8)
-
-    for df in (copy(refdf), @view copy(refdf)[1:end-1, :])
-        df2 = copy(df)
-        @test subset(df, :x) ≅ filter(:x => identity, df)
-        @test df ≅ df2
-        @test subset(df, :x) isa DataFrame
-        @test subset(df, :x, view=true) ≅ filter(:x => identity, df)
-        @test subset(df, :x, view=true) isa SubDataFrame
-        @test_throws ArgumentError subset(df, :y)
-        @test_throws ArgumentError subset(df, :y, :x)
-        @test subset(df, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
-        @test subset(df, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df)
-        @test subset(df, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
-        @test subset(df, :y, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df)
-        @test subset(df, :x, :y, skipmissing=true) ≅
-              filter([:x, :y] => (x, y) -> x && y === true, df)
-        @test subset(df, :y, :x, skipmissing=true) ≅
-              filter([:x, :y] => (x, y) -> x && y === true, df)
-        @test subset(df, :x, :y, skipmissing=true, view=true) ≅
-              filter([:x, :y] => (x, y) -> x && y === true, df)
-        @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅
-              filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df)
-        @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅
-              filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df)
-        @test subset(df, :x, :id => ByRow(<(4))) ≅
-              filter([:x, :id] => (x, id) -> x && id < 4, df)
-        @test subset(df, :x, :id => ByRow(<(4)), view=true) ≅
-              filter([:x, :id] => (x, id) -> x && id < 4, df)
-        @test_throws ArgumentError subset(df)
-        @test isempty(subset(df, :x, :x => ByRow(!)))
-        @test_throws ArgumentError subset(df, :x => x -> false, :x => x -> missing)
-        @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> missing)
-        @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> 2)
-    end
-
-    for df in (copy(refdf), @view copy(refdf)[1:end-1, :]),
-        gdf in (groupby_checked(df, :z), groupby_checked(df, :z)[[3, 2, 1]])
-        df2 = copy(df)
-        @test subset(gdf, :x) ≅ filter(:x => identity, df)
-        @test df ≅ df2
-        @test subset(gdf, :x) isa DataFrame
-        @test subset(gdf, :x, ungroup=false) ≅
-              groupby_checked(filter(:x => identity, df), :z)
-        @test subset(gdf, :x, ungroup=false) isa GroupedDataFrame{DataFrame}
-        @test subset(gdf, :x, view=true) ≅ filter(:x => identity, df)
-        @test subset(gdf, :x, view=true) isa SubDataFrame
-        @test subset(gdf, :x, view=true, ungroup=false) ≅
-              groupby_checked(filter(:x => identity, df), :z)
-        @test subset(gdf, :x, view=true, ungroup=false) isa GroupedDataFrame{<:SubDataFrame}
-        @test_throws ArgumentError subset(gdf, :y)
-        @test_throws ArgumentError subset(gdf, :y, :x)
-        @test subset(gdf, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
-        @test subset(gdf, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df)
-        @test subset(gdf, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
-        @test subset(gdf, :y, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df)
-        @test subset(gdf, :x, :y, skipmissing=true) ≅
-              filter([:x, :y] => (x, y) -> x && y === true, df)
-        @test subset(gdf, :y, :x, skipmissing=true) ≅
-              filter([:x, :y] => (x, y) -> x && y === true, df)
-        @test subset(gdf, :x, :y, skipmissing=true, view=true) ≅
-              filter([:x, :y] => (x, y) -> x && y === true, df)
-        @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅
-              filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df)
-        @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅
-              filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df)
-        @test subset(gdf, :x, :id => ByRow(<(4))) ≅
-              filter([:x, :id] => (x, id) -> x && id < 4, df)
-        @test subset(gdf, :x, :id => ByRow(<(4)), view=true) ≅
-              filter([:x, :id] => (x, id) -> x && id < 4, df)
-        @test_throws ArgumentError subset(gdf)
-        @test isempty(subset(gdf, :x, :x => ByRow(!)))
-        @test_throws ArgumentError subset(gdf, :x => x -> false, :x => x -> missing)
-        @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> missing)
-        @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> 2)
-    end
-
-    df = copy(refdf)
-    @test subset!(df, :x) === df
-    @test subset!(df, :x) ≅ df ≅ filter(:x => identity, refdf)
-    df = copy(refdf)
-    @test_throws ArgumentError subset!(df, :y)
-    @test df ≅ refdf
-    df = copy(refdf)
-    @test subset!(df, :y, skipmissing=true) === df
-    @test subset!(df, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
-    df = copy(refdf)
-    @test subset!(df, :x, :y, skipmissing=true) === df
-    @test subset!(df, :x, :y, skipmissing=true) ≅ df ≅
-          filter([:x, :y] => (x, y) -> x && y === true, refdf)
-    df = copy(refdf)
-    @test subset!(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅
-          filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf)
-    df = copy(refdf)
-    @test subset!(df, :x, :id => ByRow(<(4))) ≅ df ≅
-          filter([:x, :id] => (x, id) -> x && id < 4, refdf)
-    df = copy(refdf)
-    @test_throws ArgumentError subset!(df)
-    df = copy(refdf)
-    @test isempty(subset!(df, :x, :x => ByRow(!)))
-    @test isempty(df)
-
-    df = copy(refdf)
-    @test_throws ArgumentError subset!(df, :x => x -> false, :x => x -> missing)
-    @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> missing)
-    @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> 2)
-
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test subset!(gdf, :x) === df
-
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    gdf2 = subset!(gdf, :x, ungroup=false)
-    @test gdf2 isa GroupedDataFrame{DataFrame}
-    @test parent(gdf2) === df
-    @test gdf2 ≅ groupby_checked(df, :z) ≅ groupby_checked(filter(:x => identity, refdf), :z)
-
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test_throws ArgumentError subset!(gdf, :y)
-    @test df ≅ refdf
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test subset!(gdf, :y, skipmissing=true) === df
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test subset!(gdf, :x, :y, skipmissing=true) === df
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅
-          filter([:x, :y] => (x, y) -> x && y === true, refdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅
-          filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅
-          filter([:x, :id] => (x, id) -> x && id < 4, refdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test_throws ArgumentError subset!(gdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test isempty(subset!(gdf, :x, :x => ByRow(!)))
-    @test isempty(df)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)
-    @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing)
-    @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
-    @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)
-
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)[[3, 2, 1]]
-    @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)[[3, 2, 1]]
-    @test_throws ArgumentError subset!(gdf, :y)
-    @test df ≅ refdf
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)[[3, 2, 1]]
-    @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)[[3, 2, 1]]
-    @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅
-          filter([:x, :y] => (x, y) -> x && y === true, refdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)[[3, 2, 1]]
-    @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅
-          filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)[[3, 2, 1]]
-    @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅
-          filter([:x, :id] => (x, id) -> x && id < 4, refdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)[[3, 2, 1]]
-    @test_throws ArgumentError subset!(gdf)
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)[[3, 2, 1]]
-    @test isempty(subset!(gdf, :x, :x => ByRow(!)))
-    @test isempty(df)
-
-    df = copy(refdf)
-    gdf = groupby_checked(df, :z)[[3, 2, 1]]
-    @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing)
-    @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
-    @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)
-
-    @test_throws ArgumentError subset!(view(refdf, :, :), :x)
-    @test_throws ArgumentError subset!(groupby_checked(view(refdf, :, :), :z), :x)
-
-    df = DataFrame(g=[2, 2, 1, 1, 1, 1, 3, 3, 3], x = 1:9)
-    @test subset(df, :x => x -> x .< mean(x)) == DataFrame(g=[2, 2, 1, 1], x = 1:4)
-    @test subset(groupby_checked(df, :g), :x => x -> x .< mean(x)) ==
-          DataFrame(g=[2, 1, 1, 3], x=[1, 3, 4, 7])
-
-    @test_throws ArgumentError subset(df, :x => x -> missing)
-    @test isempty(subset(df, :x => x -> missing, skipmissing=true))
-    @test isempty(subset(df, :x => x -> false))
-    @test subset(df, :x => x -> true) ≅ df
-    @test_throws ArgumentError subset(df, :x => x -> (a=x,))
-    @test_throws ArgumentError subset(df, :x => (x -> (a=x,)) => AsTable)
-
-    @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :x, :y)
-    @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x, :y)
-    @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x)
-    @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :y)
-    @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y)
-    @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y)
-    @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x)
-    @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y)
-
-    @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y, skipmissing=true)
-    @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y, skipmissing=true)
-    @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x, skipmissing=true)
-    @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y, skipmissing=true)
-
-    @test_throws ArgumentError DataFrames._and()
-    @test_throws ArgumentError DataFrames._and_missing()
-end
-
 @testset "make sure we handle idx correctly when groups are reordered" begin
     df = DataFrame(g=[2, 2, 1, 1, 1], id = 1:5)
     @test select(df, :g, :id, :id => ByRow(identity) => :id2) ==

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -29,6 +29,7 @@ my_tests = ["utils.jl",
             "conversions.jl",
             "sort.jl",
             "grouping.jl",
+            "subset.jl",
             "join.jl",
             "iteration.jl",
             "duplicates.jl",