From 84e1a0f0d0693d76d4c2e87bd5a9d86968d39098 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 31 Dec 2022 17:28:06 +0100
Subject: [PATCH 01/12] add keep to nonunique, unique, and unique!

---
 NEWS.md                                    |   3 +
 src/DataFrames.jl                          |   1 +
 src/abstractdataframe/abstractdataframe.jl | 276 +---------------
 src/abstractdataframe/unique.jl            | 349 +++++++++++++++++++++
 src/groupeddataframe/groupeddataframe.jl   |   6 +-
 src/groupeddataframe/utils.jl              |  55 ++--
 test/data.jl                               |  56 ----
 test/duplicates.jl                         | 121 ++++++-
 8 files changed, 506 insertions(+), 361 deletions(-)
 create mode 100644 src/abstractdataframe/unique.jl

diff --git a/NEWS.md b/NEWS.md
index da12048624..39aee15a8f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -15,6 +15,9 @@
 * Joining functions now support `order` keyword argument allowing the user
   to specify the order of the rows in the produced table
   ([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233))
+* Add `keep` keyword argument to `nonunique`, `unique`, and `unique!`
+  allowing the user to specify which duplicate rows should be kept
+  ([#3260](https://github.com/JuliaData/DataFrames.jl/pull/3260))
 
 ## Bug fixes
 
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index c5d8366214..a2a652154a 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -134,6 +134,7 @@ include("other/utils.jl")
 include("other/index.jl")
 
 include("abstractdataframe/abstractdataframe.jl")
+include("abstractdataframe/unique.jl")
 include("dataframe/dataframe.jl")
 include("subdataframe/subdataframe.jl")
 include("dataframerow/dataframerow.jl")
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 9fba690d49..157cf4bf17 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -1342,278 +1342,6 @@ end
 Base.Array(df::AbstractDataFrame) = Matrix(df)
 Base.Array{T}(df::AbstractDataFrame) where {T} = Matrix{T}(df)
 
-"""
-    nonunique(df::AbstractDataFrame)
-    nonunique(df::AbstractDataFrame, cols)
-
-Return a `Vector{Bool}` in which `true` entries indicate duplicate rows.
-A row is a duplicate if there exists a prior row with all columns containing
-equal values (according to `isequal`).
-
-See also [`unique`](@ref) and [`unique!`](@ref).
-
-# Arguments
-- `df` : `AbstractDataFrame`
-- `cols` : a selector specifying the column(s) or their transformations to compare.
-  Can be any column selector or transformation accepted by [`select`](@ref) that
-  returns at least one column if `df` has at least one column.
-
-# Examples
-
-```jldoctest
-julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-
-julia> df = vcat(df, df)
-8×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-   5 │     1      1
-   6 │     2      2
-   7 │     3      1
-   8 │     4      2
-
-julia> nonunique(df)
-8-element Vector{Bool}:
- 0
- 0
- 0
- 0
- 1
- 1
- 1
- 1
-
-julia> nonunique(df, 2)
-8-element Vector{Bool}:
- 0
- 0
- 1
- 1
- 1
- 1
- 1
- 1
-```
-"""
-function nonunique(df::AbstractDataFrame)
-    ncol(df) == 0 && return Bool[]
-    gslots = row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), nothing, false, nothing)[3]
-    # unique rows are the first encountered group representatives,
-    # nonunique are everything else
-    res = fill(true, nrow(df))
-    @inbounds for g_row in gslots
-        (g_row > 0) && (res[g_row] = false)
-    end
-    return res
-end
-
-function nonunique(df::AbstractDataFrame, cols)
-    udf = _try_select_no_copy(df, cols)
-    if ncol(df) > 0 && ncol(udf) == 0
-         throw(ArgumentError("finding duplicate rows in data frame when " *
-                             "`cols` selects no columns is not allowed"))
-    else
-        return nonunique(udf)
-    end
-end
-
-"""
-    allunique(df::AbstractDataFrame, cols=:)
-
-Return `true` if all rows of `df` are not duplicated. Two rows are duplicate if
-all their columns contain equal values (according to `isequal`).
-
-See also [`unique`](@ref) and [`nonunique`](@ref).
-
-# Arguments
-- `df` : `AbstractDataFrame`
-- `cols` : a selector specifying the column(s) or their transformations to compare.
-  Can be any column selector or transformation accepted by [`select`](@ref).
-
-# Examples
-
-```jldoctest
-julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-
-julia> allunique(df)
-true
-
-julia> allunique(df, :x)
-false
-
-julia> allunique(df, :i => ByRow(isodd))
-false
-```
-"""
-function Base.allunique(df::AbstractDataFrame, cols=:)
-    udf = _try_select_no_copy(df, cols)
-    nrow(udf) == 0 && return true
-    return row_group_slots(ntuple(i -> udf[!, i], ncol(udf)),
-                           Val(false), nothing, false, nothing)[1] == nrow(df)
-end
-
-"""
-    unique(df::AbstractDataFrame; view::Bool=false)
-    unique(df::AbstractDataFrame, cols; view::Bool=false)
-
-Return a data frame containing only the first occurrence of unique rows in `df`.
-When `cols` is specified, the returned `DataFrame` contains complete rows,
-retaining in each case the first occurrence of a given combination of values
-in selected columns or their transformations. `cols` can be any column
-selector or transformation accepted by [`select`](@ref).
-
-If `view=false` a freshly allocated `DataFrame` is returned,
-and if `view=true` then a `SubDataFrame` view into `df` is returned.
-
-# Arguments
-- `df` : the AbstractDataFrame
-- `cols` :  column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
-specifying the column(s) to compare.
-
-$METADATA_FIXED
-
-See also: [`unique!`](@ref), [`nonunique`](@ref).
-
-# Examples
-
-```jldoctest
-julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-
-julia> df = vcat(df, df)
-8×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-   5 │     1      1
-   6 │     2      2
-   7 │     3      1
-   8 │     4      2
-
-julia> unique(df)   # doesn't modify df
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-
-julia> unique(df, 2)
-2×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-```
-"""
-@inline function Base.unique(df::AbstractDataFrame; view::Bool=false)
-    rowidxs = (!).(nonunique(df))
-    return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
-end
-
-@inline function Base.unique(df::AbstractDataFrame, cols; view::Bool=false)
-    rowidxs = (!).(nonunique(df, cols))
-    return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
-end
-
-"""
-    unique!(df::AbstractDataFrame)
-    unique!(df::AbstractDataFrame, cols)
-
-Update `df` in-place to contain only the first occurrence of unique rows in `df`.
-When `cols` is specified, the returned `DataFrame` contains complete rows,
-retaining in each case the first occurrence of a given combination of values
-in selected columns or their transformations. `cols` can be any column
-selector or transformation accepted by [`select`](@ref).
-
-# Arguments
-- `df` : the AbstractDataFrame
-- `cols` :  column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
-specifying the column(s) to compare.
-
-$METADATA_FIXED
-
-See also: [`unique!`](@ref), [`nonunique`](@ref).
-
-# Examples
-
-```jldoctest
-julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-
-julia> df = vcat(df, df)
-8×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-   5 │     1      1
-   6 │     2      2
-   7 │     3      1
-   8 │     4      2
-
-julia> unique!(df)  # modifies df
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-```
-"""
-Base.unique!(df::AbstractDataFrame) = deleteat!(df, _findall(nonunique(df)))
-Base.unique!(df::AbstractDataFrame, cols::AbstractVector) =
-    deleteat!(df, _findall(nonunique(df, cols)))
-Base.unique!(df::AbstractDataFrame, cols) =
-    deleteat!(df, _findall(nonunique(df, cols)))
-
 """
     fillcombinations(df::AbstractDataFrame, indexcols;
                          allowduplicates::Bool=false,
@@ -1676,8 +1404,8 @@ function fillcombinations(df::AbstractDataFrame, indexcols;
                             "must be specified"))
     end
 
-    has_duplicates = row_group_slots(ntuple(i -> df[!, colind[i]], length(colind)),
-                                     Val(false), nothing, false, nothing)[1] != nrow(df)
+    has_duplicates = row_group_slots!(ntuple(i -> df[!, colind[i]], length(colind)),
+                                      Val(false), nothing, false, nothing)[1] != nrow(df)
     if has_duplicates && !allowduplicates
         throw(ArgumentError("duplicate combinations of `indexcols` are not " *
                             "allowed in input when `allowduplicates=false`"))
diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
new file mode 100644
index 0000000000..695e58570c
--- /dev/null
+++ b/src/abstractdataframe/unique.jl
@@ -0,0 +1,349 @@
+"""
+    nonunique(df::AbstractDataFrame; keep::Symbol=:first)
+    nonunique(df::AbstractDataFrame, cols; keep::Symbol=:first)
+
+Return a `Vector{Bool}` in which `true` entries indicate duplicate rows.
+
+If `keep=:first` (the default) a row is a duplicate if there exists a prior
+row with all columns containing equal values (according to `isequal`).
+
+If `keep=:last` a row is a duplicate if there exists a subsequent row with all
+columns containing equal values (according to `isequal`).
+
+If `keep=:only` a row is a duplicate if there exists any other row with all
+columns containing equal values (according to `isequal`).
+
+See also [`unique`](@ref) and [`unique!`](@ref).
+
+# Arguments
+- `df` : `AbstractDataFrame`
+- `cols` : a selector specifying the column(s) or their transformations to
+  compare. Can be any column selector or transformation accepted by
+  [`select`](@ref) that returns at least one column if `df` has at least one
+  column.
+
+# Examples
+
+```jldoctest
+julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
+4×2 DataFrame
+ Row │ i      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      1
+   2 │     2      2
+   3 │     3      1
+   4 │     4      2
+
+julia> df = vcat(df, df)
+8×2 DataFrame
+ Row │ i      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      1
+   2 │     2      2
+   3 │     3      1
+   4 │     4      2
+   5 │     1      1
+   6 │     2      2
+   7 │     3      1
+   8 │     4      2
+
+julia> nonunique(df)
+8-element Vector{Bool}:
+ 0
+ 0
+ 0
+ 0
+ 1
+ 1
+ 1
+ 1
+
+julia> nonunique(df, keep=:last)
+8-element Vector{Bool}:
+ 1
+ 1
+ 1
+ 1
+ 0
+ 0
+ 0
+ 0
+
+julia> nonunique(df, 2)
+8-element Vector{Bool}:
+ 0
+ 0
+ 1
+ 1
+ 1
+ 1
+ 1
+ 1
+```
+"""
+function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
+    if !(keep in (:first, :last, :only))
+        throw(ArgumentError("`keep` must be :first, :last, or :only"))
+    end
+    ncol(df) == 0 && return Bool[]
+    # start with every row flagged; the branches below unflag the kept rows
+    res = fill(true, nrow(df))
+    if keep == :first
+        gslots = row_group_slots!(ntuple(i -> df[!, i], ncol(df)), Val(false),
+                                  nothing, false, nothing)[3]
+        # unique rows are the first encountered group representatives,
+        # nonunique are everything else
+        @inbounds for g_row in gslots
+            (g_row > 0) && (res[g_row] = false)
+        end
+    else
+        # TODO: this can be potentially optimized in the future,
+        #       but the use of this code is expected to be rare
+        #       so currently a simple implementation is provided
+        #       that is already visibly faster than using groupby and combine
+        gdf = groupby(df, All())
+        idx = gdf.idx
+        @assert length(gdf.starts) == length(gdf.ends)
+        if keep == :last
+            for e in gdf.ends
+                # keep last index in a group
+                res[idx[e]] = false
+            end
+        else
+            @assert keep == :only
+            for (s, e) in zip(gdf.starts, gdf.ends)
+                # a group is kept only when it consists of a single row (s == e)
+                res[idx[e]] = s != e
+            end
+        end
+    end
+    return res
+end
+
+function nonunique(df::AbstractDataFrame, cols; keep::Symbol=:first)
+    selected = _try_select_no_copy(df, cols)
+    # an empty selection is accepted only when `df` itself has no columns
+    if ncol(df) > 0 && ncol(selected) == 0
+        throw(ArgumentError("finding duplicate rows in data frame when " *
+                            "`cols` selects no columns is not allowed"))
+    end
+    return nonunique(selected, keep=keep)
+end
+
+"""
+    allunique(df::AbstractDataFrame, cols=:)
+
+Return `true` if all rows of `df` are not duplicated. Two rows are duplicate if
+all their columns contain equal values (according to `isequal`).
+
+See also [`unique`](@ref) and [`nonunique`](@ref).
+
+# Arguments
+- `df` : `AbstractDataFrame`
+- `cols` : a selector specifying the column(s) or their transformations to compare.
+  Can be any column selector or transformation accepted by [`select`](@ref).
+
+# Examples
+
+```jldoctest
+julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
+4×2 DataFrame
+ Row │ i      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      1
+   2 │     2      2
+   3 │     3      1
+   4 │     4      2
+
+julia> allunique(df)
+true
+
+julia> allunique(df, :x)
+false
+
+julia> allunique(df, :i => ByRow(isodd))
+false
+```
+"""
+function Base.allunique(df::AbstractDataFrame, cols=:)
+    udf = _try_select_no_copy(df, cols)
+    nrow(udf) == 0 && return true  # no rows, hence nothing can be duplicated
+    keycols = ntuple(i -> udf[!, i], ncol(udf))
+    return row_group_slots!(keycols, Val(false), nothing, false, nothing)[1] == nrow(df)
+end
+
+"""
+    unique(df::AbstractDataFrame; view::Bool=false, keep::Symbol=:first)
+    unique(df::AbstractDataFrame, cols; view::Bool=false, keep::Symbol=:first)
+
+If `keep=:first` (the default) return a data frame containing only the first
+occurrence of unique rows in `df`.
+
+If `keep=:last` return a data frame containing only the last occurrence of
+unique rows in `df`.
+
+If `keep=:only` return a data frame containing only rows that are unique in `df`
+(in case of duplicate rows all are dropped).
+
+When `cols` is specified, the returned `DataFrame` contains complete rows,
+retaining for each combination of values in selected columns or their
+transformations the rows indicated by `keep`. `cols` can be any column
+selector or transformation accepted by [`select`](@ref).
+
+If `view=false` a freshly allocated `DataFrame` is returned,
+and if `view=true` then a `SubDataFrame` view into `df` is returned.
+
+# Arguments
+- `df` : the AbstractDataFrame
+- `cols` :  column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
+specifying the column(s) to compare.
+
+$METADATA_FIXED
+
+See also: [`unique!`](@ref), [`nonunique`](@ref).
+
+# Examples
+
+```jldoctest
+julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
+4×2 DataFrame
+ Row │ i      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      1
+   2 │     2      2
+   3 │     3      1
+   4 │     4      2
+
+julia> df = vcat(df, df)
+8×2 DataFrame
+ Row │ i      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      1
+   2 │     2      2
+   3 │     3      1
+   4 │     4      2
+   5 │     1      1
+   6 │     2      2
+   7 │     3      1
+   8 │     4      2
+
+julia> unique(df)   # doesn't modify df
+4×2 DataFrame
+ Row │ i      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      1
+   2 │     2      2
+   3 │     3      1
+   4 │     4      2
+
+julia> unique(df, 2)
+2×2 DataFrame
+ Row │ i      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      1
+   2 │     2      2
+
+julia> unique(df, keep=:only)
+0×2 DataFrame
+ Row │ i      x     
+     │ Int64  Int64
+─────┴──────────────
+```
+"""
+@inline function Base.unique(df::AbstractDataFrame;
+                             view::Bool=false, keep::Symbol=:first)
+    keepmask = .!nonunique(df, keep=keep)  # true marks rows that are retained
+    return view ? Base.view(df, keepmask, :) : df[keepmask, :]
+end
+
+@inline function Base.unique(df::AbstractDataFrame, cols;
+                             view::Bool=false, keep::Symbol=:first)
+    keepmask = .!nonunique(df, cols, keep=keep)  # true marks rows that are retained
+    return view ? Base.view(df, keepmask, :) : df[keepmask, :]
+end
+
+"""
+    unique!(df::AbstractDataFrame; keep::Symbol=:first)
+    unique!(df::AbstractDataFrame, cols; keep::Symbol=:first)
+
+If `keep=:first` (the default) update `df` in place to contain only the first
+occurrence of unique rows in `df`.
+
+If `keep=:last` update `df` in place to contain only the last occurrence of
+unique rows in `df`.
+
+If `keep=:only` update `df` in place to contain only rows that are unique in `df`
+(in case of duplicate rows all are dropped).
+
+When `cols` is specified, `df` retains complete rows, keeping for each
+combination of values in selected columns or their transformations the
+rows indicated by `keep`. `cols` can be any column selector or
+transformation accepted by [`select`](@ref).
+
+# Arguments
+- `df` : the AbstractDataFrame
+- `cols` :  column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
+specifying the column(s) to compare.
+
+$METADATA_FIXED
+
+See also: [`unique`](@ref), [`nonunique`](@ref).
+
+# Examples
+
+```jldoctest
+julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
+4×2 DataFrame
+ Row │ i      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      1
+   2 │     2      2
+   3 │     3      1
+   4 │     4      2
+
+julia> df = vcat(df, df)
+8×2 DataFrame
+ Row │ i      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      1
+   2 │     2      2
+   3 │     3      1
+   4 │     4      2
+   5 │     1      1
+   6 │     2      2
+   7 │     3      1
+   8 │     4      2
+
+julia> unique!(df)  # modifies df
+4×2 DataFrame
+ Row │ i      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      1
+   2 │     2      2
+   3 │     3      1
+   4 │     4      2
+
+julia> unique!(vcat(df, df), keep=:only)
+0×2 DataFrame
+ Row │ i      x     
+     │ Int64  Int64
+─────┴──────────────
+```
+"""
+Base.unique!(df::AbstractDataFrame; keep::Symbol=:first) =
+    deleteat!(df, _findall(nonunique(df, keep=keep)))  # delete flagged rows in place
+Base.unique!(df::AbstractDataFrame, cols::AbstractVector; keep::Symbol=:first) =
+    deleteat!(df, _findall(nonunique(df, cols, keep=keep)))  # same, judged on `cols`
+Base.unique!(df::AbstractDataFrame, cols; keep::Symbol=:first) =
+    deleteat!(df, _findall(nonunique(df, cols, keep=keep)))  # same, judged on `cols`
+
diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
index f6d4bf9c69..6417c1d68c 100644
--- a/src/groupeddataframe/groupeddataframe.jl
+++ b/src/groupeddataframe/groupeddataframe.jl
@@ -223,7 +223,7 @@ function groupby(df::AbstractDataFrame, cols;
        (cols isa AbstractVector && any(x -> x isa UserColOrdering, cols))
         if isnothing(sort) || sort === true
             # if sort === true replace it with NamedTuple to avoid sorting
-            # in row_group_slots as we will perform sorting later
+            # in row_group_slots! as we will perform sorting later
             sort = NamedTuple()
         elseif sort === false
             throw(ArgumentError("passing `order` is only allowed if `sort` " *
@@ -248,13 +248,13 @@ function groupby(df::AbstractDataFrame, cols;
 
     groups = Vector{Int}(undef, nrow(df))
     ngroups, rhashes, gslots, sorted =
-        row_group_slots(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false),
+        row_group_slots!(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false),
                         groups, skipmissing, sort isa NamedTuple ? nothing : sort)
 
     gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing,
                           ngroups, nothing, Threads.ReentrantLock())
 
-    # sort groups if row_group_slots hasn't already done that
+    # sort groups if row_group_slots! hasn't already done that
     if (sort === true && !sorted) || (sort isa NamedTuple)
         # Find index of representative row for each group
         idx = Vector{Int}(undef, length(gd))
diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
index 3139f30339..a173c0f2f4 100644
--- a/src/groupeddataframe/utils.jl
+++ b/src/groupeddataframe/utils.jl
@@ -82,12 +82,12 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int,
 
 # IntegerRefarray and IntegerRefPool are two complementary view types that allow
 # wrapping arrays with Union{Real, Missing} eltype to satisfy the DataAPI.refpool
-# and DataAPI.refarray API when calling row_group_slots.
+# and DataAPI.refarray API when calling row_group_slots!.
 # IntegerRefarray converts values to Int and replaces missing with an integer
 # (set by the caller to the maximum value + 1)
 # IntegerRefPool subtracts the minimum value - 1 and replaces back the maximum
 # value + 1 to missing. This ensures all values are in 1:length(refpool), while
-# row_group_slots knows the number of (potential) groups via length(refpool)
+# row_group_slots! knows the number of (potential) groups via length(refpool)
 # and is able to skip missing values when skipmissing=true
 
 struct IntegerRefarray{T<:AbstractArray} <: AbstractVector{Int}
@@ -157,7 +157,7 @@ function refpool_and_array(x::AbstractArray)
             minval, maxval = extrema(x)
         end
         ngroups = big(maxval) - big(minval) + 1
-        # Threshold chosen with the same rationale as the row_group_slots refpool method:
+        # Threshold chosen with the same rationale as the row_group_slots! refpool method:
         # refpool approach is faster but we should not allocate too much memory either
         # We also have to avoid overflow, including with ngroups + 1 for missing values
         # (note that it would be possible to allow minval and maxval to be outside of the
@@ -181,11 +181,12 @@ end
 # 4) whether groups are already sorted
 # Optional `groups` vector is set to the group indices of each row (starting at 1)
 # With skipmissing=true, rows with missing values are attributed index 0.
-function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
-                         hash::Val,
-                         groups::Union{Vector{Int}, Nothing},
-                         skipmissing::Bool,
-                         sort::Union{Bool, Nothing})::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
+function row_group_slots!(cols::Tuple{Vararg{AbstractVector}},
+                          hash::Val,
+                          groups::Union{Vector{Int}, Nothing},
+                          skipmissing::Bool,
+                          sort::Union{Bool, Nothing}
+                         )::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
     rpa = refpool_and_array.(cols)
     if sort === false
         refpools = nothing
@@ -194,17 +195,17 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
         refpools = first.(rpa)
         refarrays = last.(rpa)
     end
-    row_group_slots(cols, refpools, refarrays, hash, groups, skipmissing, sort === true)
+    row_group_slots!(cols, refpools, refarrays, hash, groups, skipmissing, sort === true)
 end
 
 # Generic fallback method based on open addressing hash table
-function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
-                         refpools::Any,  # Ignored
-                         refarrays::Any, # Ignored
-                         hash::Val,
-                         groups::Union{Vector{Int}, Nothing},
-                         skipmissing::Bool,
-                         sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
+function row_group_slots!(cols::Tuple{Vararg{AbstractVector}},
+                          refpools::Any,  # Ignored
+                          refarrays::Any, # Ignored
+                          hash::Val,
+                          groups::Union{Vector{Int}, Nothing},
+                          skipmissing::Bool,
+                          sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
     @assert groups === nothing || length(groups) == length(cols[1])
     rhashes, missings = hashrows(cols, skipmissing)
     # inspired by Dict code from base cf. https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481
@@ -251,16 +252,16 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
 end
 
 # Optimized method for arrays for which DataAPI.refpool is defined and returns an AbstractVector
-function row_group_slots(cols::NTuple{N, AbstractVector},
-                         refpools::NTuple{N, AbstractVector},
-                         refarrays::NTuple{N,
-                             Union{AbstractVector{<:Real},
-                                   Missings.EachReplaceMissing{
-                                       <:AbstractVector{<:Union{Real, Missing}}}}},
-                         hash::Val{false},
-                         groups::Vector{Int},
-                         skipmissing::Bool,
-                         sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N
+function row_group_slots!(cols::NTuple{N, AbstractVector},
+                          refpools::NTuple{N, AbstractVector},
+                          refarrays::NTuple{N,
+                              Union{AbstractVector{<:Real},
+                                    Missings.EachReplaceMissing{
+                                        <:AbstractVector{<:Union{Real, Missing}}}}},
+                          hash::Val{false},
+                          groups::Vector{Int},
+                          skipmissing::Bool,
+                          sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N
     # Computing neither hashes nor groups isn't very useful,
     # and this method needs to allocate a groups vector anyway
     @assert all(col -> length(col) == length(groups), cols)
@@ -296,7 +297,7 @@ function row_group_slots(cols::NTuple{N, AbstractVector},
         newcols = (skipmissing && any(refpool -> eltype(refpool) >: Missing, refpools)) ||
                   !(refarrays isa NTuple{<:Any, AbstractVector}) ||
                   sort ? cols : refarrays
-        return invoke(row_group_slots,
+        return invoke(row_group_slots!,
                       Tuple{Tuple{Vararg{AbstractVector}}, Any, Any, Val,
                             Union{Vector{Int}, Nothing}, Bool, Bool},
                       newcols, refpools, refarrays, hash, groups, skipmissing, sort)
diff --git a/test/data.jl b/test/data.jl
index 3399ad35e7..b5348c5705 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -229,62 +229,6 @@ end
     @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true)
 end
 
-@testset "nonunique, nonunique, unique! with extra argument" begin
-    df1 = DataFrame(a=Union{String, Missing}["a", "b", "a", "b", "a", "b"],
-                    b=Vector{Union{Int, Missing}}(1:6),
-                    c=Union{Int, Missing}[1:3;1:3])
-    df = vcat(df1, df1)
-    @test findall(nonunique(df)) == collect(7:12)
-    @test findall(nonunique(df, :)) == collect(7:12)
-    @test findall(nonunique(df, Colon())) == collect(7:12)
-    @test findall(nonunique(df, :a)) == collect(3:12)
-    @test findall(nonunique(df, "a")) == collect(3:12)
-    @test findall(nonunique(df, [:a, :c])) == collect(7:12)
-    @test findall(nonunique(df, ["a", "c"])) == collect(7:12)
-    @test findall(nonunique(df, r"[ac]")) == collect(7:12)
-    @test findall(nonunique(df, Not(2))) == collect(7:12)
-    @test findall(nonunique(df, Not([2]))) == collect(7:12)
-    @test findall(nonunique(df, Not(:b))) == collect(7:12)
-    @test findall(nonunique(df, Not([:b]))) == collect(7:12)
-    @test findall(nonunique(df, Not([false, true, false]))) == collect(7:12)
-    @test findall(nonunique(df, [1, 3])) == collect(7:12)
-    @test findall(nonunique(df, 1)) == collect(3:12)
-    @test findall(nonunique(df, :a => x -> 1)) == 2:12
-
-    @test unique(df) == df1
-    @test unique(df, :) == df1
-    @test unique(df, Colon()) == df1
-    @test unique(df, 2:3) == df1
-    @test unique(df, 3) == df1[1:3, :]
-    @test unique(df, [1, 3]) == df1
-    @test unique(df, [:a, :c]) == df1
-    @test unique(df, ["a", "c"]) == df1
-    @test unique(df, r"[ac]") == df1
-    @test unique(df, Not(2)) == df1
-    @test unique(df, Not([2])) == df1
-    @test unique(df, Not(:b)) == df1
-    @test unique(df, Not([:b])) == df1
-    @test unique(df, Not([false, true, false])) == df1
-    @test unique(df, :a) == df1[1:2, :]
-    @test unique(df, "a") == df1[1:2, :]
-    @test unique(df, :a => x -> 1) == df[1:1, :]
-    @test unique(DataFrame()) == DataFrame()
-    @test isempty(nonunique(DataFrame())) && nonunique(DataFrame()) isa Vector{Bool}
-    @test_throws ArgumentError nonunique(DataFrame(a=1:3), [])
-    @test_throws ArgumentError unique(DataFrame(a=1:3), [])
-
-    @test unique(copy(df1), "a") == unique(copy(df1), :a) == unique(copy(df1), 1) ==
-          df1[1:2, :]
-
-    unique!(df, [1, 3])
-    @test df == df1
-    for cols in (r"[ac]", Not(:b), Not(2), Not([:b]), Not([2]), Not([false, true, false]))
-        df = vcat(df1, df1)
-        unique!(df, cols)
-        @test df == df1
-    end
-end
-
 @testset "filter() and filter!()" begin
     df = DataFrame(x=[3, 1, 2, 1], y=["b", "c", "a", "b"])
     @test filter(r -> r[:x] > 1, df) == DataFrame(x=[3, 2], y=["b", "a"])
diff --git a/test/duplicates.jl b/test/duplicates.jl
index ec85020c02..e562de54b9 100644
--- a/test/duplicates.jl
+++ b/test/duplicates.jl
@@ -1,6 +1,6 @@
 module TestDuplicates
 
-using Test, DataFrames, CategoricalArrays
+using Test, DataFrames, CategoricalArrays, Random
 const ≅ = isequal
 
 @testset "nonunique" begin
@@ -41,4 +41,123 @@ const ≅ = isequal
     @test_throws ArgumentError unique(pdf, true)
 end
 
+@testset "nonunique, unique, unique! with extra argument" begin
+    df1 = DataFrame(a=Union{String, Missing}["a", "b", "a", "b", "a", "b"],
+                    b=Vector{Union{Int, Missing}}(1:6),
+                    c=Union{Int, Missing}[1:3;1:3])
+    df = vcat(df1, df1)  # rows 7:12 repeat rows 1:6
+    @test findall(nonunique(df)) == collect(7:12)
+    @test findall(nonunique(df, :)) == collect(7:12)
+    @test findall(nonunique(df, Colon())) == collect(7:12)
+    @test findall(nonunique(df, :a)) == collect(3:12)
+    @test findall(nonunique(df, "a")) == collect(3:12)
+    @test findall(nonunique(df, [:a, :c])) == collect(7:12)
+    @test findall(nonunique(df, ["a", "c"])) == collect(7:12)
+    @test findall(nonunique(df, r"[ac]")) == collect(7:12)
+    @test findall(nonunique(df, Not(2))) == collect(7:12)
+    @test findall(nonunique(df, Not([2]))) == collect(7:12)
+    @test findall(nonunique(df, Not(:b))) == collect(7:12)
+    @test findall(nonunique(df, Not([:b]))) == collect(7:12)
+    @test findall(nonunique(df, Not([false, true, false]))) == collect(7:12)
+    @test findall(nonunique(df, [1, 3])) == collect(7:12)
+    @test findall(nonunique(df, 1)) == collect(3:12)
+    @test findall(nonunique(df, :a => x -> 1)) == 2:12  # constant key: one group
+
+    @test unique(df) == df1
+    @test unique(df, :) == df1
+    @test unique(df, Colon()) == df1
+    @test unique(df, 2:3) == df1
+    @test unique(df, 3) == df1[1:3, :]
+    @test unique(df, [1, 3]) == df1
+    @test unique(df, [:a, :c]) == df1
+    @test unique(df, ["a", "c"]) == df1
+    @test unique(df, r"[ac]") == df1
+    @test unique(df, Not(2)) == df1
+    @test unique(df, Not([2])) == df1
+    @test unique(df, Not(:b)) == df1
+    @test unique(df, Not([:b])) == df1
+    @test unique(df, Not([false, true, false])) == df1
+    @test unique(df, :a) == df1[1:2, :]
+    @test unique(df, "a") == df1[1:2, :]
+    @test unique(df, :a => x -> 1) == df[1:1, :]
+    @test unique(DataFrame()) == DataFrame()
+    @test isempty(nonunique(DataFrame())) && nonunique(DataFrame()) isa Vector{Bool}
+    @test_throws ArgumentError nonunique(DataFrame(a=1:3), [])
+    @test_throws ArgumentError unique(DataFrame(a=1:3), [])
+
+    @test unique(copy(df1), "a") == unique(copy(df1), :a) == unique(copy(df1), 1) ==
+          df1[1:2, :]
+
+    unique!(df, [1, 3])  # in place: df shrinks to the 6 distinct rows
+    @test df == df1
+    for cols in (r"[ac]", Not(:b), Not(2), Not([:b]), Not([2]), Not([false, true, false]))
+        df = vcat(df1, df1)  # rebuild the duplicated input for each selector
+        unique!(df, cols)
+        @test df == df1
+    end
+end
+
+@testset "keep argument to nonunique/unique/unique!" begin
+    df = DataFrame(a=[1, 2, 3, 1, 2, 1],
+                   b=["a", "b", "c", "a", "b", "a"],
+                   c=categorical(["a", "b", "c", "a", "b", "a"]))  # same groups as a and b
+    for cols in (1, 2, 3, [1, 2], [1, 3], [2, 3], [1, 2, 3])
+        @test nonunique(df, cols, keep=:first) ==
+              [false, false, false, true, true, true]
+        @test nonunique(df, cols, keep=:last) ==
+              [true, true, false, true, false, false]
+        @test nonunique(df, cols, keep=:only) ==
+              [true, true, false, true, true, true]
+        @test nonunique(select(df, cols), keep=:first) ==
+              [false, false, false, true, true, true]
+        @test nonunique(select(df, cols), keep=:last) ==
+              [true, true, false, true, false, false]
+        @test nonunique(select(df, cols), keep=:only) ==
+              [true, true, false, true, true, true]
+
+        @test unique(df, cols, keep=:first) ==
+              df[.![false, false, false, true, true, true], :]
+        @test unique(df, cols, keep=:last) ==
+              df[.![true, true, false, true, false, false], :]
+        @test unique(df, cols, keep=:only) ==
+              df[.![true, true, false, true, true, true], :]
+        @test unique(select(df, cols), keep=:first) ==
+              df[.![false, false, false, true, true, true], Cols(cols)]
+        @test unique(select(df, cols), keep=:last) ==
+              df[.![true, true, false, true, false, false], Cols(cols)]
+        @test unique(select(df, cols), keep=:only) ==
+              df[.![true, true, false, true, true, true], Cols(cols)]
+
+        @test unique!(copy(df), cols, keep=:first) ==
+              df[.![false, false, false, true, true, true], :]
+        @test unique!(copy(df), cols, keep=:last) ==
+              df[.![true, true, false, true, false, false], :]
+        @test unique!(copy(df), cols, keep=:only) ==
+              df[.![true, true, false, true, true, true], :]
+        @test unique!(select(df, cols), keep=:first) ==
+              df[.![false, false, false, true, true, true], Cols(cols)]
+        @test unique!(select(df, cols), keep=:last) ==
+              df[.![true, true, false, true, false, false], Cols(cols)]
+        @test unique!(select(df, cols), keep=:only) ==
+              df[.![true, true, false, true, true, true], Cols(cols)]
+    end
+
+    # larger randomized test (fixed seed): compare with groupby/combine reference results
+    Random.seed!(1234)
+    df = DataFrame(a=rand(1:10^5, 10^5))
+    df.b = string.(df.a)
+    df.c = categorical(df.b)
+    df.id = 1:10^5  # original row position; used below to re-sort the reference results
+
+    for cols in (1, 2, 3, [1, 2], [1, 3], [2, 3], [1, 2, 3])
+        @test select(unique(df, cols, keep=:first), cols, Not(cols)) ==
+              combine(groupby(df, cols, sort=false), first)
+        @test select(unique(df, cols, keep=:last), cols, Not(cols)) ==
+              sort(combine(groupby(df, cols, sort=false), last), :id)
+        @test select(unique(df, cols, keep=:only), cols, Not(cols)) ==
+              sort(combine(groupby(df, cols, sort=false),
+                           sdf -> nrow(sdf) == 1 ? sdf : NamedTuple()), :id)
+    end
+end
+
 end # module

From fde2f2183e946fe736250e2169fdb003d2076ddc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 31 Dec 2022 19:53:05 +0100
Subject: [PATCH 02/12] improve tests and fix docs

---
 src/abstractdataframe/unique.jl | 2 +-
 test/duplicates.jl              | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
index 695e58570c..aba2c2ba2b 100644
--- a/src/abstractdataframe/unique.jl
+++ b/src/abstractdataframe/unique.jl
@@ -323,7 +323,7 @@ julia> df = vcat(df, df)
    7 │     3      1
    8 │     4      2
 
-julia> unique!(df)  # modifies df
+julia> unique!(copy(df))  # modifies the copy; df itself is unchanged
 4×2 DataFrame
  Row │ i      x
      │ Int64  Int64
diff --git a/test/duplicates.jl b/test/duplicates.jl
index e562de54b9..fa3b246069 100644
--- a/test/duplicates.jl
+++ b/test/duplicates.jl
@@ -158,6 +158,13 @@ end
               sort(combine(groupby(df, cols, sort=false),
                            sdf -> nrow(sdf) == 1 ? sdf : NamedTuple()), :id)
     end
+
+    @test isempty(nonunique(DataFrame(), keep=:first))
+    @test unique(DataFrame(a=[]), keep=:last) == DataFrame(a=[])
+    @test unique!(DataFrame(), keep=:only) == DataFrame()
+    @test_throws ArgumentError nonunique(DataFrame(), keep=:a)
+    @test_throws ArgumentError unique(DataFrame(), keep=:b)
+    @test_throws ArgumentError unique!(DataFrame(), keep=:c)
 end
 
 end # module

From b6662c1a0465a3efc8a5a25f72ec667427399914 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 1 Jan 2023 18:56:45 +0100
Subject: [PATCH 03/12] improve performance

---
 src/abstractdataframe/unique.jl | 83 +++++++++++++++++++++++++--------
 src/groupeddataframe/utils.jl   |  1 +
 2 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
index aba2c2ba2b..45836ca51c 100644
--- a/src/abstractdataframe/unique.jl
+++ b/src/abstractdataframe/unique.jl
@@ -89,33 +89,76 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
     end
     ncol(df) == 0 && return Bool[]
     res = fill(true, nrow(df))
+    cols = ntuple(i -> df[!, i], ncol(df))
     if keep == :first
-        gslots = row_group_slots!(ntuple(i -> df[!, i], ncol(df)), Val(false),
-                                  nothing, false, nothing)[3]
-        # unique rows are the first encountered group representatives,
-        # nonunique are everything else
-        @inbounds for g_row in gslots
-            (g_row > 0) && (res[g_row] = false)
+        # if we can take advantage of references pass groups to avoid generating hashes
+        rpa = refpool_and_array.(cols)
+        refpools = first.(rpa)
+        refarrays = last.(rpa)
+        if isnothing(refpools) || isnothing(refarrays)
+            ngroups, _, gslots, _ = row_group_slots!(cols, Val(true), nothing,
+                                                     false, nothing)
+            # unique rows are the first encountered group representatives,
+            # nonunique are everything else
+            cseen = 0
+            @inbounds for g_row in gslots
+                if g_row > 0
+                    res[g_row] = false
+                    # this check slows down the process when all rows are unique
+                    # but speeds up when we have duplicates
+                    cseen += 1
+                    cseen == ngroups && break
+                end
+            end
+        else
+            groups = Vector{Int}(undef, nrow(df))
+            ngroups = row_group_slots!(cols, refpools, refarrays,
+                                       Val(false), groups, false, false)[1]
+            seen = fill(false, ngroups)
+            cseen = 0
+            for i in 1:nrow(df)
+                g = groups[i]
+                if !seen[g]
+                    seen[g] = true
+                    res[i] = false
+                    cseen += 1
+                    cseen == ngroups && break
+                end
+            end
         end
-        return res
     else
-        # TODO: this can be potentially optimized in the future,
-        #       but the use of this code is expected to be rare
-        #       so currently a simple implementation is provided
-        #       that is already visibly faster than using groupby and combine 
-        gdf = groupby(df, All())
-        idx = gdf.idx
-        @assert length(gdf.starts) == length(gdf.ends)
+        groups = Vector{Int}(undef, nrow(df))
+        ngroups = row_group_slots!(cols, Val(false), groups, false, nothing)[1]
         if keep == :last
-            for (s, e) in zip(gdf.starts, gdf.ends)
-                # keep last index in a group
-                res[idx[e]] = false
+            seen = fill(false, ngroups)
+            cseen = 0
+            for i in nrow(df):-1:1
+                g = groups[i]
+                if !seen[g]
+                    seen[g] = true
+                    res[i] = false
+                    cseen += 1
+                    cseen == ngroups && break
+                end
             end
         else
             @assert keep == :only
-            for (s, e) in zip(gdf.starts, gdf.ends)
-                # set to false if s == e
-                res[idx[e]] = s != e
+            # -1 indicates that we have not seen the group yet
+            # positive value indicates the first position we have seen the group
+            # 0 indicates that we have seen the group at least twice
+            firstseen = fill(-1, ngroups)
+            for i in 1:nrow(df)
+                g = groups[i]
+                j = firstseen[g]
+                if j == -1
+                    # this is possibly non duplicate row
+                    firstseen[g] = i
+                    res[i] = false
+                elseif j > 0
+                    # the row had duplicate
+                    res[j] = true
+                    firstseen[g] = 0
+                end
             end
         end
     end
diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
index a173c0f2f4..ae7a8e4013 100644
--- a/src/groupeddataframe/utils.jl
+++ b/src/groupeddataframe/utils.jl
@@ -178,6 +178,7 @@ end
 # 2) vector of row hashes (may be empty if hash=Val(false))
 # 3) slot array for a hash map, non-zero values are
 #    the indices of the first row in a group
+#    (returned only if hashes are generated)
 # 4) whether groups are already sorted
 # Optional `groups` vector is set to the group indices of each row (starting at 1)
 # With skipmissing=true, rows with missing values are attributed index 0.

From 7409ba024183b33a5fcbf731a650d824b59b0393 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 1 Jan 2023 22:14:56 +0100
Subject: [PATCH 04/12] fix condition

---
 src/abstractdataframe/unique.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
index 45836ca51c..7cc9ca45f9 100644
--- a/src/abstractdataframe/unique.jl
+++ b/src/abstractdataframe/unique.jl
@@ -95,7 +95,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
         rpa = refpool_and_array.(cols)
         refpools = first.(rpa)
         refarrays = last.(rpa)
-        if isnothing(refpools) || isnothing(refarrays)
+        if any(isnothing, refpools) || any(isnothing, refarrays)
             ngroups, _, gslots, _ = row_group_slots!(cols, Val(true), nothing,
                                                      false, nothing)
             # unique rows are the first encountered group representatives,

From e6a9f8e01983539a55cb8d51f4153b48c7a29099 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 9 Jan 2023 09:51:26 +0100
Subject: [PATCH 05/12] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/unique.jl | 36 +++++++++++++++------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
index 7cc9ca45f9..1b4bb9b9eb 100644
--- a/src/abstractdataframe/unique.jl
+++ b/src/abstractdataframe/unique.jl
@@ -151,11 +151,11 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
                 g = groups[i]
                 j = firstseen[g]
                 if j == -1
-                    # this is possibly non duplicate row
+                    # this is possibly a non duplicate row
                     firstseen[g] = i
                     res[i] = false
                 elseif j > 0
-                    # the row had duplicate
+                    # the row had a duplicate
                     res[j] = true
                     firstseen[g] = 0
                 end
@@ -170,16 +170,16 @@ function nonunique(df::AbstractDataFrame, cols; keep::Symbol=:first)
     if ncol(df) > 0 && ncol(udf) == 0
          throw(ArgumentError("finding duplicate rows in data frame when " *
                              "`cols` selects no columns is not allowed"))
-    else
-        return nonunique(udf, keep=keep)
     end
+    return nonunique(udf, keep=keep)
 end
 
 """
     allunique(df::AbstractDataFrame, cols=:)
 
-Return `true` if all rows of `df` are not duplicated. Two rows are duplicate if
-all their columns contain equal values (according to `isequal`).
+Return `true` if none of the rows of `df` are duplicated. Two rows are duplicates if
+all their columns contain equal values (according to `isequal`)
+for all columns in `cols` (by default, all columns).
 
 See also [`unique`](@ref) and [`nonunique`](@ref).
 
@@ -222,27 +222,23 @@ end
     unique(df::AbstractDataFrame; view::Bool=false, keep::Symbol=:first)
     unique(df::AbstractDataFrame, cols; view::Bool=false, keep::Symbol=:first)
 
-If `keep=:first` (the default) return a data frame containing only the first
-occurrence of unique rows in `df`.
-
-If `keep=:last` return a data frame containing only the last occurrence of
-unique rows in `df`.
-
-If `keep=:only` return a data frame containing only rows that are unique in `df`
-(in case of duplicate rows all are dropped).
+Return a data frame containing only unique rows in `df`.
 
-When `cols` is specified, the returned `DataFrame` contains complete rows,
-retaining in each case the first occurrence of a given combination of values
-in selected columns or their transformations. `cols` can be any column
-selector or transformation accepted by [`select`](@ref).
+Non-unique (duplicate) rows are those for which at least another row contains equal values
+(according to `isequal`) for all columns in `cols` (by default, all columns).
+If `keep=:first` (the default), only the first occurrence of a set of duplicate rows is kept.
+If `keep=:last`, only the last occurrence of a set of duplicate rows is kept.
+If `keep=:only`, only rows without any duplicates are kept.
 
 If `view=false` a freshly allocated `DataFrame` is returned,
 and if `view=true` then a `SubDataFrame` view into `df` is returned.
 
 # Arguments
 - `df` : the AbstractDataFrame
-- `cols` :  column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
-specifying the column(s) to compare.
+- `cols` : a selector specifying the column(s) or their transformations to
+  compare. Can be any column selector or transformation accepted by
+  [`select`](@ref) that returns at least one column if `df` has at least one
+  column.
 
 $METADATA_FIXED
 

From 570a80bc926f9ad16bfafe5fc00291ad64d6fe6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 9 Jan 2023 10:06:46 +0100
Subject: [PATCH 06/12] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/unique.jl | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
index 1b4bb9b9eb..9ff7cab4f7 100644
--- a/src/abstractdataframe/unique.jl
+++ b/src/abstractdataframe/unique.jl
@@ -4,14 +4,13 @@
 
 Return a `Vector{Bool}` in which `true` entries indicate duplicate rows.
 
-If `keep=:first` (the default) a row is a duplicate if there exists a prior
-row with all columns containing equal values (according to `isequal`).
-
-If `keep=:last` a row is a duplicate if there exists a subsequent row with all
-columns containing equal values (according to `isequal`).
-
-If `keep=:only` a row is a duplicate if there exists any other row with all
-columns containing equal values (according to `isequal`).
+Duplicate rows are those for which at least another row contains equal values
+(according to `isequal`) for all columns in `cols` (by default, all columns).
+If `keep=:first` (the default), only the first occurrence of a set of duplicate rows
+is indicated with a `false` entry.
+If `keep=:last`, only the last occurrence of a set of duplicate rows
+is indicated with a `false` entry.
+If `keep=:only`, only rows without any duplicates are indicated with a `false` entry.
 
 See also [`unique`](@ref) and [`unique!`](@ref).
 

From 0484a135edde8c4699629c7723927e1958c7f08b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 9 Jan 2023 12:22:53 +0100
Subject: [PATCH 07/12] changes after code review

---
 src/abstractdataframe/abstractdataframe.jl |  3 +-
 src/abstractdataframe/unique.jl            | 94 ++++++++++------------
 src/groupeddataframe/groupeddataframe.jl   |  3 +-
 src/groupeddataframe/utils.jl              | 25 ++++--
 test/duplicates.jl                         | 24 ++++--
 5 files changed, 80 insertions(+), 69 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 157cf4bf17..29262019f3 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -1404,8 +1404,9 @@ function fillcombinations(df::AbstractDataFrame, indexcols;
                             "must be specified"))
     end
 
+    # we use hashing algorithm here, because we assume that the tables we work with are not huge
     has_duplicates = row_group_slots!(ntuple(i -> df[!, colind[i]], length(colind)),
-                                      Val(false), nothing, false, nothing)[1] != nrow(df)
+                                      Val(false), nothing, false, nothing, false)[1] != nrow(df)
     if has_duplicates && !allowduplicates
         throw(ArgumentError("duplicate combinations of `indexcols` are not " *
                             "allowed in input when `allowduplicates=false`"))
diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
index 9ff7cab4f7..705af10a04 100644
--- a/src/abstractdataframe/unique.jl
+++ b/src/abstractdataframe/unique.jl
@@ -6,11 +6,12 @@ Return a `Vector{Bool}` in which `true` entries indicate duplicate rows.
 
 Duplicate rows are those for which at least another row contains equal values
 (according to `isequal`) for all columns in `cols` (by default, all columns).
-If `keep=:first` (the default), only the first occurrence of a set of duplicate rows
-is indicated with a `false` entry.
-If `keep=:last`, only the last occurrence of a set of duplicate rows
-is indicated with a `false` entry.
-If `keep=:only`, only rows without any duplicates are indicated with a `false` entry.
+If `keep=:first` (the default), only the first occurrence of a set of duplicate
+rows is indicated with a `false` entry.
+If `keep=:last`, only the last occurrence of a set of duplicate rows is
+indicated with a `false` entry.
+If `keep=:nonduplicates`, only rows without any duplicates are indicated with a
+`false` entry.
 
 See also [`unique`](@ref) and [`unique!`](@ref).
 
@@ -83,8 +84,8 @@ julia> nonunique(df, 2)
 ```
 """
 function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
-    if !(keep in (:first, :last, :only))
-        throw(ArgumentError("`keep` must be :first, :last, or :none"))
+    if !(keep in (:first, :last, :nonduplicates))
+        throw(ArgumentError("`keep` must be :first, :last, or :nonduplicates"))
     end
     ncol(df) == 0 && return Bool[]
     res = fill(true, nrow(df))
@@ -95,53 +96,40 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
         refpools = first.(rpa)
         refarrays = last.(rpa)
         if any(isnothing, refpools) || any(isnothing, refarrays)
-            ngroups, _, gslots, _ = row_group_slots!(cols, Val(true), nothing,
-                                                     false, nothing)
+            _, _, gslots, _ = row_group_slots!(cols, Val(true), nothing,
+                                                     false, nothing, true)
             # unique rows are the first encountered group representatives,
             # nonunique are everything else
-            cseen = 0
             @inbounds for g_row in gslots
-                if g_row > 0
-                    res[g_row] = false
-                    # this check slows down the process when all rows are unique
-                    # but speeds up when we have duplicates
-                    cseen += 1
-                    cseen == ngroups && break
-                end
+                g_row > 0 && (res[g_row] = false)
             end
         else
             groups = Vector{Int}(undef, nrow(df))
             ngroups = row_group_slots!(cols, refpools, refarrays,
-                                       Val(false), groups, false, false)[1]
+                                       Val(false), groups, false, false, true)[1]
             seen = fill(false, ngroups)
-            cseen = 0
             for i in 1:nrow(df)
                 g = groups[i]
                 if !seen[g]
                     seen[g] = true
                     res[i] = false
-                    cseen += 1
-                    cseen == ngroups && break
                 end
             end
         end
     else
         groups = Vector{Int}(undef, nrow(df))
-        ngroups = row_group_slots!(cols, Val(false), groups, false, nothing)[1]
+        ngroups = row_group_slots!(cols, Val(false), groups, false, nothing, true)[1]
         if keep == :last
             seen = fill(false, ngroups)
-            cseen = 0
             for i in nrow(df):-1:1
                 g = groups[i]
                 if !seen[g]
                     seen[g] = true
                     res[i] = false
-                    cseen += 1
-                    cseen == ngroups && break
                 end
             end
         else
-            @assert keep == :only
+            @assert keep == :nonduplicates
             # -1 indicates that we have not seen the group yet
             # positive value indicates the first position we have seen the group
             # 0 indicates that we have seen the group at least twice
@@ -176,16 +164,17 @@ end
 """
     allunique(df::AbstractDataFrame, cols=:)
 
-Return `true` if none of the rows of `df` are duplicated. Two rows are duplicates if
-all their columns contain equal values (according to `isequal`)
+Return `true` if none of the rows of `df` are duplicated. Two rows are
+duplicates if all their columns contain equal values (according to `isequal`)
 for all columns in `cols` (by default, all columns).
 
 See also [`unique`](@ref) and [`nonunique`](@ref).
 
 # Arguments
 - `df` : `AbstractDataFrame`
-- `cols` : a selector specifying the column(s) or their transformations to compare.
-  Can be any column selector or transformation accepted by [`select`](@ref).
+- `cols` : a selector specifying the column(s) or their transformations to
+  compare. Can be any column selector or transformation accepted by
+  [`select`](@ref).
 
 # Examples
 
@@ -214,7 +203,7 @@ function Base.allunique(df::AbstractDataFrame, cols=:)
     udf = _try_select_no_copy(df, cols)
     nrow(udf) == 0 && return true
     return row_group_slots!(ntuple(i -> udf[!, i], ncol(udf)),
-                            Val(false), nothing, false, nothing)[1] == nrow(df)
+                            Val(false), nothing, false, nothing, false)[1] == nrow(df)
 end
 
 """
@@ -223,14 +212,16 @@ end
 
 Return a data frame containing only unique rows in `df`.
 
-Non-unique (duplicate) rows are those for which at least another row contains equal values
-(according to `isequal`) for all columns in `cols` (by default, all columns).
-If `keep=:first` (the default), only the first occurrence of a set of duplicate rows is kept.
+Non-unique (duplicate) rows are those for which at least another row contains
+equal values (according to `isequal`) for all columns in `cols` (by default,
+all columns).
+If `keep=:first` (the default), only the first occurrence of a set of duplicate
+rows is kept.
 If `keep=:last`, only the last occurrence of a set of duplicate rows is kept.
-If `keep=:only`, only rows without any duplicates are kept.
+If `keep=:nonduplicates`, only rows without any duplicates are kept.
 
-If `view=false` a freshly allocated `DataFrame` is returned,
-and if `view=true` then a `SubDataFrame` view into `df` is returned.
+If `view=false` a freshly allocated `DataFrame` is returned, and if `view=true`
+then a `SubDataFrame` view into `df` is returned.
 
 # Arguments
 - `df` : the AbstractDataFrame
@@ -288,7 +279,7 @@ julia> unique(df, 2)
    1 │     1      1
    2 │     2      2
 
-julia> unique(df, keep=:only)
+julia> unique(df, keep=:nonduplicates)
 0×2 DataFrame
  Row │ i      x     
      │ Int64  Int64
@@ -311,24 +302,23 @@ end
     unique!(df::AbstractDataFrame; keep::Symbol=:first)
     unique!(df::AbstractDataFrame, cols; keep::Symbol=:first)
 
-If `keep=:first` (the default) update `df` in place to contain only the first
-occurrence of unique rows in `df`.
+Update `df` in-place to contain only unique rows.
 
-If `keep=:last` update `df` in place to contain only the last occurrence of
-unique rows in `df`.
-
-If `keep=:only` update `df` in place to contain only rows that are unique in `df`
-(in case of duplicate rows all are dropped).
-
-When `cols` is specified, the returned `DataFrame` contains complete rows,
-retaining in each case the first occurrence of a given combination of values
-in selected columns or their transformations. `cols` can be any column
-selector or transformation accepted by [`select`](@ref).
+Non-unique (duplicate) rows are those for which at least another row contains
+equal values (according to `isequal`) for all columns in `cols` (by default,
+all columns).
+If `keep=:first` (the default), only the first occurrence of a set of duplicate
+rows is kept.
+If `keep=:last`, only the last occurrence of a set of duplicate rows is kept.
+If `keep=:nonduplicates`, only rows without any duplicates are kept.
 
 # Arguments
 - `df` : the AbstractDataFrame
 - `cols` :  column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
-specifying the column(s) to compare.
+  specifying the column(s) to compare. Can be any column selector or
+  transformation accepted by [`select`](@ref) that returns at least one column
+  if `df` has at least one column.
+
 
 $METADATA_FIXED
 
@@ -371,7 +361,7 @@ julia> unique!(copy(df))  # modifies df
    3 │     3      1
    4 │     4      2
 
-julia> unique(df, keep=:only)
+julia> unique(df, keep=:nonduplicates)
 0×2 DataFrame
  Row │ i      x     
      │ Int64  Int64
diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
index 6417c1d68c..b0bc59046e 100644
--- a/src/groupeddataframe/groupeddataframe.jl
+++ b/src/groupeddataframe/groupeddataframe.jl
@@ -249,7 +249,8 @@ function groupby(df::AbstractDataFrame, cols;
     groups = Vector{Int}(undef, nrow(df))
     ngroups, rhashes, gslots, sorted =
         row_group_slots!(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false),
-                        groups, skipmissing, sort isa NamedTuple ? nothing : sort)
+                         groups, skipmissing,
+                         sort isa NamedTuple ? nothing : sort, false)
 
     gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing,
                           ngroups, nothing, Threads.ReentrantLock())
diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
index ae7a8e4013..47b777ac69 100644
--- a/src/groupeddataframe/utils.jl
+++ b/src/groupeddataframe/utils.jl
@@ -182,11 +182,17 @@ end
 # 4) whether groups are already sorted
 # Optional `groups` vector is set to the group indices of each row (starting at 1)
 # With skipmissing=true, rows with missing values are attributed index 0.
+#
+# Also the last argument is nonunique. If it is `true` then groups are not
+# compressed to form a continuous sequence. Normally `false` should be passed
+# as this ensures that returned `ngroups` indeed indicates the number of groups
+# but in `nonunique` we do not use this information so compressing can be skipped
 function row_group_slots!(cols::Tuple{Vararg{AbstractVector}},
                           hash::Val,
                           groups::Union{Vector{Int}, Nothing},
                           skipmissing::Bool,
-                          sort::Union{Bool, Nothing}
+                          sort::Union{Bool, Nothing},
+                          nonunique::Bool
                          )::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
     rpa = refpool_and_array.(cols)
     if sort === false
@@ -196,7 +202,8 @@ function row_group_slots!(cols::Tuple{Vararg{AbstractVector}},
         refpools = first.(rpa)
         refarrays = last.(rpa)
     end
-    row_group_slots!(cols, refpools, refarrays, hash, groups, skipmissing, sort === true)
+    row_group_slots!(cols, refpools, refarrays, hash, groups, skipmissing,
+                     sort === true, nonunique)
 end
 
 # Generic fallback method based on open addressing hash table
@@ -206,7 +213,8 @@ function row_group_slots!(cols::Tuple{Vararg{AbstractVector}},
                           hash::Val,
                           groups::Union{Vector{Int}, Nothing},
                           skipmissing::Bool,
-                          sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
+                          sort::Bool,
+                          nonunique::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
     @assert groups === nothing || length(groups) == length(cols[1])
     rhashes, missings = hashrows(cols, skipmissing)
     # inspired by Dict code from base cf. https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481
@@ -262,7 +270,8 @@ function row_group_slots!(cols::NTuple{N, AbstractVector},
                           hash::Val{false},
                           groups::Vector{Int},
                           skipmissing::Bool,
-                          sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N
+                          sort::Bool,
+                          nonunique::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N
     # Computing neither hashes nor groups isn't very useful,
     # and this method needs to allocate a groups vector anyway
     @assert all(col -> length(col) == length(groups), cols)
@@ -300,8 +309,8 @@ function row_group_slots!(cols::NTuple{N, AbstractVector},
                   sort ? cols : refarrays
         return invoke(row_group_slots!,
                       Tuple{Tuple{Vararg{AbstractVector}}, Any, Any, Val,
-                            Union{Vector{Int}, Nothing}, Bool, Bool},
-                      newcols, refpools, refarrays, hash, groups, skipmissing, sort)
+                            Union{Vector{Int}, Nothing}, Bool, Bool, Bool},
+                      newcols, refpools, refarrays, hash, groups, skipmissing, sort, nonunique)
     end
 
     strides = (cumprod(collect(reverse(ngroupstup)))[end-1:-1:1]..., 1)::NTuple{N, Int}
@@ -430,7 +439,9 @@ function row_group_slots!(cols::NTuple{N, AbstractVector},
     # If some groups are unused, compress group indices to drop them
     # sum(seen) is faster than all(seen) when not short-circuiting,
     # and short-circuit would only happen in the slower case anyway
-    if sum(seen) < length(seen)
+    #
+    # This process is not needed if row_group_slots! is called from nonunique
+    if !nonunique && sum(seen) < length(seen)
         oldngroups = ngroups
         remap = zeros(Int, ngroups)
         ngroups = 0
diff --git a/test/duplicates.jl b/test/duplicates.jl
index fa3b246069..9fcd7c546e 100644
--- a/test/duplicates.jl
+++ b/test/duplicates.jl
@@ -106,39 +106,39 @@ end
               [false, false, false, true, true, true]
         @test nonunique(df, cols, keep=:last) ==
               [true, true, false, true, false, false]
-        @test nonunique(df, cols, keep=:only) ==
+        @test nonunique(df, cols, keep=:nonduplicates) ==
               [true, true, false, true, true, true]
         @test nonunique(select(df, cols), keep=:first) ==
               [false, false, false, true, true, true]
         @test nonunique(select(df, cols), keep=:last) ==
               [true, true, false, true, false, false]
-        @test nonunique(select(df, cols), keep=:only) ==
+        @test nonunique(select(df, cols), keep=:nonduplicates) ==
               [true, true, false, true, true, true]
 
         @test unique(df, cols, keep=:first) ==
               df[.![false, false, false, true, true, true], :]
         @test unique(df, cols, keep=:last) ==
               df[.![true, true, false, true, false, false], :]
-        @test unique(df, cols, keep=:only) ==
+        @test unique(df, cols, keep=:nonduplicates) ==
               df[.![true, true, false, true, true, true], :]
         @test unique(select(df, cols), keep=:first) ==
               df[.![false, false, false, true, true, true], Cols(cols)]
         @test unique(select(df, cols), keep=:last) ==
               df[.![true, true, false, true, false, false], Cols(cols)]
-        @test unique(select(df, cols), keep=:only) ==
+        @test unique(select(df, cols), keep=:nonduplicates) ==
               df[.![true, true, false, true, true, true], Cols(cols)]
 
         @test unique!(copy(df), cols, keep=:first) ==
               df[.![false, false, false, true, true, true], :]
         @test unique!(copy(df), cols, keep=:last) ==
               df[.![true, true, false, true, false, false], :]
-        @test unique!(copy(df), cols, keep=:only) ==
+        @test unique!(copy(df), cols, keep=:nonduplicates) ==
               df[.![true, true, false, true, true, true], :]
         @test unique!(select(df, cols), keep=:first) ==
               df[.![false, false, false, true, true, true], Cols(cols)]
         @test unique!(select(df, cols), keep=:last) ==
               df[.![true, true, false, true, false, false], Cols(cols)]
-        @test unique!(select(df, cols), keep=:only) ==
+        @test unique!(select(df, cols), keep=:nonduplicates) ==
               df[.![true, true, false, true, true, true], Cols(cols)]
     end
 
@@ -154,17 +154,25 @@ end
               combine(groupby(df, cols, sort=false), first)
         @test select(unique(df, cols, keep=:last), cols, Not(cols)) ==
               sort(combine(groupby(df, cols, sort=false), last), :id)
-        @test select(unique(df, cols, keep=:only), cols, Not(cols)) ==
+        @test select(unique(df, cols, keep=:nonduplicates), cols, Not(cols)) ==
               sort(combine(groupby(df, cols, sort=false),
                            sdf -> nrow(sdf) == 1 ? sdf : NamedTuple()), :id)
     end
 
     @test isempty(nonunique(DataFrame(), keep=:first))
     @test unique(DataFrame(a=[]), keep=:last) == DataFrame(a=[])
-    @test unique!(DataFrame(), keep=:only) == DataFrame()
+    @test unique!(DataFrame(), keep=:nonduplicates) == DataFrame()
     @test_throws ArgumentError nonunique(DataFrame(), keep=:a)
     @test_throws ArgumentError unique(DataFrame(), keep=:b)
     @test_throws ArgumentError unique!(DataFrame(), keep=:c)
 end
 
+@testset "case when groups are not compressed in row_group_slots!" begin
+   df = DataFrame(x=repeat([1:1000; -1], 2));
+   @test getindex.(keys(groupby(df, :x, sort=true)), 1) == [-1; 1:1000]
+   @test nonunique(df, :x) == [falses(1001); trues(1001)]
+   @test nonunique(df, :x, keep=:last) == [trues(1001); falses(1001)]
+   @test all(nonunique(df, :x, keep=:nonduplicates))
+end
+
 end # module

From 1caa657a78c223037e69da5fcfeb0ec1087adbaa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 9 Jan 2023 15:03:26 +0100
Subject: [PATCH 08/12] small fixes

---
 src/abstractdataframe/unique.jl | 19 +++++++++----------
 test/duplicates.jl              | 18 +++++++++---------
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
index 705af10a04..f7fc1daeb3 100644
--- a/src/abstractdataframe/unique.jl
+++ b/src/abstractdataframe/unique.jl
@@ -13,8 +13,6 @@ indicated with a `false` entry.
 If `keep=:nonduplicates`, only rows without any duplicates are indicated with a
 `false` entry.
 
-See also [`unique`](@ref) and [`unique!`](@ref).
-
 # Arguments
 - `df` : `AbstractDataFrame`
 - `cols` : a selector specifying the column(s) or their transformations to
@@ -22,6 +20,8 @@ See also [`unique`](@ref) and [`unique!`](@ref).
   [`select`](@ref) that returns at least one column if `df` has at least one
   column.
 
+See also [`unique`](@ref) and [`unique!`](@ref).
+
 # Examples
 
 ```jldoctest
@@ -97,7 +97,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
         refarrays = last.(rpa)
         if any(isnothing, refpools) || any(isnothing, refarrays)
             _, _, gslots, _ = row_group_slots!(cols, Val(true), nothing,
-                                                     false, nothing, true)
+                                               false, nothing, true)
             # unique rows are the first encountered group representatives,
             # nonunique are everything else
             @inbounds for g_row in gslots
@@ -168,14 +168,14 @@ Return `true` if none of the rows of `df` are duplicated. Two rows are
 duplicates if all their columns contain equal values (according to `isequal`)
 for all columns in `cols` (by default, all columns).
 
-See also [`unique`](@ref) and [`nonunique`](@ref).
-
 # Arguments
 - `df` : `AbstractDataFrame`
 - `cols` : a selector specifying the column(s) or their transformations to
   compare. Can be any column selector or transformation accepted by
   [`select`](@ref).
 
+See also [`unique`](@ref) and [`nonunique`](@ref).
+
 # Examples
 
 ```jldoctest
@@ -218,7 +218,7 @@ all columns).
 If `keep=:first` (the default), only the first occurrence of a set of duplicate
 rows is kept.
 If `keep=:last`, only the last occurrence of a set of duplicate rows is kept.
-If `keep=:nonduplicates`, only rows without any duplicates are kept.
+If `keep=:noduplicates`, only rows without any duplicates are kept.
 
 If `view=false` a freshly allocated `DataFrame` is returned, and if `view=true`
 then a `SubDataFrame` view into `df` is returned.
@@ -279,7 +279,7 @@ julia> unique(df, 2)
    1 │     1      1
    2 │     2      2
 
-julia> unique(df, keep=:nonduplicates)
+julia> unique(df, keep=:noduplicates)
 0×2 DataFrame
  Row │ i      x     
      │ Int64  Int64
@@ -310,7 +310,7 @@ all columns).
 If `keep=:first` (the default), only the first occurrence of a set of duplicate
 rows is kept.
 If `keep=:last`, only the last occurrence of a set of duplicate rows is kept.
-If `keep=:nonduplicates`, only rows without any duplicates are kept.
+If `keep=:noduplicates`, only rows without any duplicates are kept.
 
 # Arguments
 - `df` : the AbstractDataFrame
@@ -319,7 +319,6 @@ If `keep=:nonduplicates`, only rows without any duplicates are kept.
   transformation accepted by [`select`](@ref) that returns at least one column
   if `df` has at least one column.
 
-
 $METADATA_FIXED
 
 See also: [`unique!`](@ref), [`nonunique`](@ref).
@@ -361,7 +360,7 @@ julia> unique!(copy(df))  # modifies df
    3 │     3      1
    4 │     4      2
 
-julia> unique(df, keep=:nonduplicates)
+julia> unique(df, keep=:noduplicates)
 0×2 DataFrame
  Row │ i      x     
      │ Int64  Int64
diff --git a/test/duplicates.jl b/test/duplicates.jl
index 9fcd7c546e..61c01874d2 100644
--- a/test/duplicates.jl
+++ b/test/duplicates.jl
@@ -106,39 +106,39 @@ end
               [false, false, false, true, true, true]
         @test nonunique(df, cols, keep=:last) ==
               [true, true, false, true, false, false]
-        @test nonunique(df, cols, keep=:nonduplicates) ==
+        @test nonunique(df, cols, keep=:noduplicates) ==
               [true, true, false, true, true, true]
         @test nonunique(select(df, cols), keep=:first) ==
               [false, false, false, true, true, true]
         @test nonunique(select(df, cols), keep=:last) ==
               [true, true, false, true, false, false]
-        @test nonunique(select(df, cols), keep=:nonduplicates) ==
+        @test nonunique(select(df, cols), keep=:noduplicates) ==
               [true, true, false, true, true, true]
 
         @test unique(df, cols, keep=:first) ==
               df[.![false, false, false, true, true, true], :]
         @test unique(df, cols, keep=:last) ==
               df[.![true, true, false, true, false, false], :]
-        @test unique(df, cols, keep=:nonduplicates) ==
+        @test unique(df, cols, keep=:noduplicates) ==
               df[.![true, true, false, true, true, true], :]
         @test unique(select(df, cols), keep=:first) ==
               df[.![false, false, false, true, true, true], Cols(cols)]
         @test unique(select(df, cols), keep=:last) ==
               df[.![true, true, false, true, false, false], Cols(cols)]
-        @test unique(select(df, cols), keep=:nonduplicates) ==
+        @test unique(select(df, cols), keep=:noduplicates) ==
               df[.![true, true, false, true, true, true], Cols(cols)]
 
         @test unique!(copy(df), cols, keep=:first) ==
               df[.![false, false, false, true, true, true], :]
         @test unique!(copy(df), cols, keep=:last) ==
               df[.![true, true, false, true, false, false], :]
-        @test unique!(copy(df), cols, keep=:nonduplicates) ==
+        @test unique!(copy(df), cols, keep=:noduplicates) ==
               df[.![true, true, false, true, true, true], :]
         @test unique!(select(df, cols), keep=:first) ==
               df[.![false, false, false, true, true, true], Cols(cols)]
         @test unique!(select(df, cols), keep=:last) ==
               df[.![true, true, false, true, false, false], Cols(cols)]
-        @test unique!(select(df, cols), keep=:nonduplicates) ==
+        @test unique!(select(df, cols), keep=:noduplicates) ==
               df[.![true, true, false, true, true, true], Cols(cols)]
     end
 
@@ -154,14 +154,14 @@ end
               combine(groupby(df, cols, sort=false), first)
         @test select(unique(df, cols, keep=:last), cols, Not(cols)) ==
               sort(combine(groupby(df, cols, sort=false), last), :id)
-        @test select(unique(df, cols, keep=:nonduplicates), cols, Not(cols)) ==
+        @test select(unique(df, cols, keep=:noduplicates), cols, Not(cols)) ==
               sort(combine(groupby(df, cols, sort=false),
                            sdf -> nrow(sdf) == 1 ? sdf : NamedTuple()), :id)
     end
 
     @test isempty(nonunique(DataFrame(), keep=:first))
     @test unique(DataFrame(a=[]), keep=:last) == DataFrame(a=[])
-    @test unique!(DataFrame(), keep=:nonduplicates) == DataFrame()
+    @test unique!(DataFrame(), keep=:noduplicates) == DataFrame()
     @test_throws ArgumentError nonunique(DataFrame(), keep=:a)
     @test_throws ArgumentError unique(DataFrame(), keep=:b)
     @test_throws ArgumentError unique!(DataFrame(), keep=:c)
@@ -172,7 +172,7 @@ end
    @test getindex.(keys(groupby(df, :x, sort=true)), 1) == [-1; 1:1000]
    @test nonunique(df, :x) == [falses(1001); trues(1001)]
    @test nonunique(df, :x, keep=:last) == [trues(1001); falses(1001)]
-   @test all(nonunique(df, :x, keep=:nonduplicates))
+   @test all(nonunique(df, :x, keep=:noduplicates))
 end
 
 end # module

From c48a96c31eedfd74a04890878756411a557ebe6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 9 Jan 2023 18:10:43 +0100
Subject: [PATCH 09/12] fix typo

---
 src/abstractdataframe/unique.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
index f7fc1daeb3..cc2a8d1f1f 100644
--- a/src/abstractdataframe/unique.jl
+++ b/src/abstractdataframe/unique.jl
@@ -10,7 +10,7 @@ If `keep=:first` (the default), only the first occurrence of a set of duplicate
 rows is indicated with a `false` entry.
 If `keep=:last`, only the last occurrence of a set of duplicate rows is
 indicated with a `false` entry.
-If `keep=:nonduplicates`, only rows without any duplicates are indicated with a
+If `keep=:noduplicates`, only rows without any duplicates are indicated with a
 `false` entry.
 
 # Arguments
@@ -84,8 +84,8 @@ julia> nonunique(df, 2)
 ```
 """
 function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
-    if !(keep in (:first, :last, :nonduplicates))
-        throw(ArgumentError("`keep` must be :first, :last, or :nonduplicates"))
+    if !(keep in (:first, :last, :noduplicates))
+        throw(ArgumentError("`keep` must be :first, :last, or :noduplicates"))
     end
     ncol(df) == 0 && return Bool[]
     res = fill(true, nrow(df))
@@ -129,7 +129,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
                 end
             end
         else
-            @assert keep == :nonduplicates
+            @assert keep == :noduplicates
             # -1 indicates that we have not seen the group yet
             # positive value indicates the first position we have seen the group
             # 0 indicates that we have seen the group at least twice

From 6212b45ba4d41001e7d716652d5fb0b51068c2a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Thu, 26 Jan 2023 16:11:53 +0100
Subject: [PATCH 10/12] change last argument name of row_group_slots!

---
 src/abstractdataframe/abstractdataframe.jl |  3 +--
 src/abstractdataframe/unique.jl            | 13 ++++++-------
 src/groupeddataframe/groupeddataframe.jl   |  2 +-
 src/groupeddataframe/utils.jl              | 21 +++++++++++----------
 4 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 29262019f3..1056ed665b 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -1406,7 +1406,7 @@ function fillcombinations(df::AbstractDataFrame, indexcols;
 
     # we use hashing algorithm here, because we assume that the tables we work with are not huge
     has_duplicates = row_group_slots!(ntuple(i -> df[!, colind[i]], length(colind)),
-                                      Val(false), nothing, false, nothing, false)[1] != nrow(df)
+                                      Val(false), nothing, false, nothing, true)[1] != nrow(df)
     if has_duplicates && !allowduplicates
         throw(ArgumentError("duplicate combinations of `indexcols` are not " *
                             "allowed in input when `allowduplicates=false`"))
@@ -3131,4 +3131,3 @@ function Base.iterate(itr::Iterators.PartitionIterator{<:AbstractDataFrame}, sta
     r = min(state + itr.n - 1, last_idx)
     return view(itr.c, state:r, :), r + 1
 end
-
diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
index cc2a8d1f1f..19cec8be7d 100644
--- a/src/abstractdataframe/unique.jl
+++ b/src/abstractdataframe/unique.jl
@@ -97,7 +97,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
         refarrays = last.(rpa)
         if any(isnothing, refpools) || any(isnothing, refarrays)
             _, _, gslots, _ = row_group_slots!(cols, Val(true), nothing,
-                                               false, nothing, true)
+                                               false, nothing, false)
             # unique rows are the first encountered group representatives,
             # nonunique are everything else
             @inbounds for g_row in gslots
@@ -106,7 +106,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
         else
             groups = Vector{Int}(undef, nrow(df))
             ngroups = row_group_slots!(cols, refpools, refarrays,
-                                       Val(false), groups, false, false, true)[1]
+                                       Val(false), groups, false, false, false)[1]
             seen = fill(false, ngroups)
             for i in 1:nrow(df)
                 g = groups[i]
@@ -118,7 +118,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
         end
     else
         groups = Vector{Int}(undef, nrow(df))
-        ngroups = row_group_slots!(cols, Val(false), groups, false, nothing, true)[1]
+        ngroups = row_group_slots!(cols, Val(false), groups, false, nothing, false)[1]
         if keep == :last
             seen = fill(false, ngroups)
             for i in nrow(df):-1:1
@@ -203,7 +203,7 @@ function Base.allunique(df::AbstractDataFrame, cols=:)
     udf = _try_select_no_copy(df, cols)
     nrow(udf) == 0 && return true
     return row_group_slots!(ntuple(i -> udf[!, i], ncol(udf)),
-                            Val(false), nothing, false, nothing, false)[1] == nrow(df)
+                            Val(false), nothing, false, nothing, true)[1] == nrow(df)
 end
 
 """
@@ -281,7 +281,7 @@ julia> unique(df, 2)
 
 julia> unique(df, keep=:noduplicates)
 0×2 DataFrame
- Row │ i      x     
+ Row │ i      x
      │ Int64  Int64
 ─────┴──────────────
 ```
@@ -362,7 +362,7 @@ julia> unique!(copy(df))  # modifies df
 
 julia> unique(df, keep=:noduplicates)
 0×2 DataFrame
- Row │ i      x     
+ Row │ i      x
      │ Int64  Int64
 ─────┴──────────────
 ```
@@ -373,4 +373,3 @@ Base.unique!(df::AbstractDataFrame, cols::AbstractVector; keep::Symbol=:first) =
     deleteat!(df, _findall(nonunique(df, cols, keep=keep)))
 Base.unique!(df::AbstractDataFrame, cols; keep::Symbol=:first) =
     deleteat!(df, _findall(nonunique(df, cols, keep=keep)))
-
diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
index b0bc59046e..d08bef7f55 100644
--- a/src/groupeddataframe/groupeddataframe.jl
+++ b/src/groupeddataframe/groupeddataframe.jl
@@ -250,7 +250,7 @@ function groupby(df::AbstractDataFrame, cols;
     ngroups, rhashes, gslots, sorted =
         row_group_slots!(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false),
                          groups, skipmissing,
-                         sort isa NamedTuple ? nothing : sort, false)
+                         sort isa NamedTuple ? nothing : sort, true)
 
     gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing,
                           ngroups, nothing, Threads.ReentrantLock())
diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
index 47b777ac69..ade793042c 100644
--- a/src/groupeddataframe/utils.jl
+++ b/src/groupeddataframe/utils.jl
@@ -183,16 +183,17 @@ end
 # Optional `groups` vector is set to the group indices of each row (starting at 1)
 # With skipmissing=true, rows with missing values are attributed index 0.
 #
-# Also the last argument is nonunique. If it is `true` then groups are not
-# compressed to form a continuous sequence. Normally `false` should be passed
+# Also the last argument is `compress`. If it is `false` then groups are not
+# compressed to form a continuous sequence. Normally `true` should be passed
 # as this ensures that returned `ngroups` indeed indicates the number of groups
-# but in `nonunique` we do not use this information so compressing can be skipped
+# but e.g. in `nonunique` the exact number of groups is not needed, so compressing
+# can be skipped by passing `compress=false`
 function row_group_slots!(cols::Tuple{Vararg{AbstractVector}},
                           hash::Val,
                           groups::Union{Vector{Int}, Nothing},
                           skipmissing::Bool,
                           sort::Union{Bool, Nothing},
-                          nonunique::Bool
+                          compress::Bool
                          )::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
     rpa = refpool_and_array.(cols)
     if sort === false
@@ -203,7 +204,7 @@ function row_group_slots!(cols::Tuple{Vararg{AbstractVector}},
         refarrays = last.(rpa)
     end
     row_group_slots!(cols, refpools, refarrays, hash, groups, skipmissing,
-                     sort === true, nonunique)
+                     sort === true, compress)
 end
 
 # Generic fallback method based on open addressing hash table
@@ -214,7 +215,7 @@ function row_group_slots!(cols::Tuple{Vararg{AbstractVector}},
                           groups::Union{Vector{Int}, Nothing},
                           skipmissing::Bool,
                           sort::Bool,
-                          nonunique::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
+                          compress::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
     @assert groups === nothing || length(groups) == length(cols[1])
     rhashes, missings = hashrows(cols, skipmissing)
     # inspired by Dict code from base cf. https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481
@@ -271,7 +272,7 @@ function row_group_slots!(cols::NTuple{N, AbstractVector},
                           groups::Vector{Int},
                           skipmissing::Bool,
                           sort::Bool,
-                          nonunique::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N
+                          compress::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N
     # Computing neither hashes nor groups isn't very useful,
     # and this method needs to allocate a groups vector anyway
     @assert all(col -> length(col) == length(groups), cols)
@@ -310,7 +311,7 @@ function row_group_slots!(cols::NTuple{N, AbstractVector},
         return invoke(row_group_slots!,
                       Tuple{Tuple{Vararg{AbstractVector}}, Any, Any, Val,
                             Union{Vector{Int}, Nothing}, Bool, Bool, Bool},
-                      newcols, refpools, refarrays, hash, groups, skipmissing, sort, nonunique)
+                      newcols, refpools, refarrays, hash, groups, skipmissing, sort, compress)
     end
 
     strides = (cumprod(collect(reverse(ngroupstup)))[end-1:-1:1]..., 1)::NTuple{N, Int}
@@ -440,8 +441,8 @@ function row_group_slots!(cols::NTuple{N, AbstractVector},
     # sum(seen) is faster than all(seen) when not short-circuiting,
     # and short-circuit would only happen in the slower case anyway
     #
-    # This process is not needed if row_group_slots! is called from nonunique
-    if !nonunique && sum(seen) < length(seen)
+    # This process is not needed if row_group_slots! is called with compress=false
+    if compress && sum(seen) < length(seen)
         oldngroups = ngroups
         remap = zeros(Int, ngroups)
         ngroups = 0

From 1bd00b9a8167d9635765ad4e59478e785f271c5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 28 Jan 2023 10:49:23 +0100
Subject: [PATCH 11/12] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/unique.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
index 19cec8be7d..03cddfe74d 100644
--- a/src/abstractdataframe/unique.jl
+++ b/src/abstractdataframe/unique.jl
@@ -91,10 +91,10 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
     res = fill(true, nrow(df))
     cols = ntuple(i -> df[!, i], ncol(df))
     if keep == :first
-        # if we can take advantage of references pass groups to avoid generating hashes
         rpa = refpool_and_array.(cols)
         refpools = first.(rpa)
         refarrays = last.(rpa)
+        # if refarray cannot be used, we can avoid allocating a groups vector
         if any(isnothing, refpools) || any(isnothing, refarrays)
             _, _, gslots, _ = row_group_slots!(cols, Val(true), nothing,
                                                false, nothing, false)
@@ -103,7 +103,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
             @inbounds for g_row in gslots
                 g_row > 0 && (res[g_row] = false)
             end
-        else
+        else # faster refarray method but allocates a groups vector
             groups = Vector{Int}(undef, nrow(df))
             ngroups = row_group_slots!(cols, refpools, refarrays,
                                        Val(false), groups, false, false, false)[1]
@@ -117,6 +117,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
             end
         end
     else
+        # always allocate a groups vector, use refarray automatically if possible
         groups = Vector{Int}(undef, nrow(df))
         ngroups = row_group_slots!(cols, Val(false), groups, false, nothing, false)[1]
         if keep == :last

From 0f46347f04d50aa650e84281e71adf35c7734184 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 28 Jan 2023 14:15:41 +0100
Subject: [PATCH 12/12] Update src/groupeddataframe/utils.jl

---
 src/groupeddataframe/utils.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
index ade793042c..d8acb7983a 100644
--- a/src/groupeddataframe/utils.jl
+++ b/src/groupeddataframe/utils.jl
@@ -193,8 +193,7 @@ function row_group_slots!(cols::Tuple{Vararg{AbstractVector}},
                           groups::Union{Vector{Int}, Nothing},
                           skipmissing::Bool,
                           sort::Union{Bool, Nothing},
-                          compress::Bool
-                         )::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
+                          compress::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
     rpa = refpool_and_array.(cols)
     if sort === false
         refpools = nothing