add keep to nonunique, unique, and unique! (#3260)

JuliaData · Jan 28, 2023 · 4446a3d · 4446a3d
1 parent 70d1e23
commit 4446a3d
Show file tree

Hide file tree

Showing 8 changed files with 566 additions and 365 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -17,6 +17,9 @@
 * Joining functions now support `order` keyword argument allowing the user
   to specify the order of the rows in the produced table
   ([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233))
+* Add `keep` keyword argument to `nonunique`, `unique`, and `unique!`
+  allowing the user to specify which duplicate rows should be kept
+  ([#3260](https://github.com/JuliaData/DataFrames.jl/pull/3260))
 
 ## Bug fixes
 

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -134,6 +134,7 @@ include("other/utils.jl")
 include("other/index.jl")
 
 include("abstractdataframe/abstractdataframe.jl")
+include("abstractdataframe/unique.jl")
 include("dataframe/dataframe.jl")
 include("subdataframe/subdataframe.jl")
 include("dataframerow/dataframerow.jl")

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -1369,278 +1369,6 @@ end
 Base.Array(df::AbstractDataFrame) = Matrix(df)
 Base.Array{T}(df::AbstractDataFrame) where {T} = Matrix{T}(df)
 
-"""
-    nonunique(df::AbstractDataFrame)
-    nonunique(df::AbstractDataFrame, cols)
-
-Return a `Vector{Bool}` in which `true` entries indicate duplicate rows.
-A row is a duplicate if there exists a prior row with all columns containing
-equal values (according to `isequal`).
-
-See also [`unique`](@ref) and [`unique!`](@ref).
-
-# Arguments
-- `df` : `AbstractDataFrame`
-- `cols` : a selector specifying the column(s) or their transformations to compare.
-  Can be any column selector or transformation accepted by [`select`](@ref) that
-  returns at least one column if `df` has at least one column.
-
-# Examples
-
-```jldoctest
-julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-
-julia> df = vcat(df, df)
-8×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-   5 │     1      1
-   6 │     2      2
-   7 │     3      1
-   8 │     4      2
-
-julia> nonunique(df)
-8-element Vector{Bool}:
- 0
- 0
- 0
- 0
- 1
- 1
- 1
- 1
-
-julia> nonunique(df, 2)
-8-element Vector{Bool}:
- 0
- 0
- 1
- 1
- 1
- 1
- 1
- 1
-```
-"""
-function nonunique(df::AbstractDataFrame)
-    ncol(df) == 0 && return Bool[]
-    gslots = row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), nothing, false, nothing)[3]
-    # unique rows are the first encountered group representatives,
-    # nonunique are everything else
-    res = fill(true, nrow(df))
-    @inbounds for g_row in gslots
-        (g_row > 0) && (res[g_row] = false)
-    end
-    return res
-end
-
-function nonunique(df::AbstractDataFrame, cols)
-    udf = _try_select_no_copy(df, cols)
-    if ncol(df) > 0 && ncol(udf) == 0
-         throw(ArgumentError("finding duplicate rows in data frame when " *
-                             "`cols` selects no columns is not allowed"))
-    else
-        return nonunique(udf)
-    end
-end
-
-"""
-    allunique(df::AbstractDataFrame, cols=:)
-
-Return `true` if none of the rows of `df` are duplicated. Two rows are duplicates if
-all their columns contain equal values (according to `isequal`).
-
-See also [`unique`](@ref) and [`nonunique`](@ref).
-
-# Arguments
-- `df` : `AbstractDataFrame`
-- `cols` : a selector specifying the column(s) or their transformations to compare.
-  Can be any column selector or transformation accepted by [`select`](@ref).
-
-# Examples
-
-```jldoctest
-julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-
-julia> allunique(df)
-true
-
-julia> allunique(df, :x)
-false
-
-julia> allunique(df, :i => ByRow(isodd))
-false
-```
-"""
-function Base.allunique(df::AbstractDataFrame, cols=:)
-    udf = _try_select_no_copy(df, cols)
-    nrow(udf) == 0 && return true
-    return row_group_slots(ntuple(i -> udf[!, i], ncol(udf)),
-                           Val(false), nothing, false, nothing)[1] == nrow(df)
-end
-
-"""
-    unique(df::AbstractDataFrame; view::Bool=false)
-    unique(df::AbstractDataFrame, cols; view::Bool=false)
-
-Return a data frame containing only the first occurrence of unique rows in `df`.
-When `cols` is specified, the returned `DataFrame` contains complete rows,
-retaining in each case the first occurrence of a given combination of values
-in selected columns or their transformations. `cols` can be any column
-selector or transformation accepted by [`select`](@ref).
-
-If `view=false` a freshly allocated `DataFrame` is returned,
-and if `view=true` then a `SubDataFrame` view into `df` is returned.
-
-# Arguments
-- `df` : the AbstractDataFrame
-- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
-  specifying the column(s) to compare.
-
-$METADATA_FIXED
-
-See also: [`unique!`](@ref), [`nonunique`](@ref).
-
-# Examples
-
-```jldoctest
-julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-
-julia> df = vcat(df, df)
-8×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-   5 │     1      1
-   6 │     2      2
-   7 │     3      1
-   8 │     4      2
-
-julia> unique(df)   # doesn't modify df
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-
-julia> unique(df, 2)
-2×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-```
-"""
-@inline function Base.unique(df::AbstractDataFrame; view::Bool=false)
-    rowidxs = (!).(nonunique(df))
-    return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
-end
-
-@inline function Base.unique(df::AbstractDataFrame, cols; view::Bool=false)
-    rowidxs = (!).(nonunique(df, cols))
-    return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
-end
-
-"""
-    unique!(df::AbstractDataFrame)
-    unique!(df::AbstractDataFrame, cols)
-
-Update `df` in-place to contain only the first occurrence of unique rows in `df`.
-When `cols` is specified, the returned `DataFrame` contains complete rows,
-retaining in each case the first occurrence of a given combination of values
-in selected columns or their transformations. `cols` can be any column
-selector or transformation accepted by [`select`](@ref).
-
-# Arguments
-- `df` : the AbstractDataFrame
-- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
-  specifying the column(s) to compare.
-
-$METADATA_FIXED
-
-See also: [`unique`](@ref), [`nonunique`](@ref).
-
-# Examples
-
-```jldoctest
-julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-
-julia> df = vcat(df, df)
-8×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-   5 │     1      1
-   6 │     2      2
-   7 │     3      1
-   8 │     4      2
-
-julia> unique!(df)  # modifies df
-4×2 DataFrame
- Row │ i      x
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      1
-   2 │     2      2
-   3 │     3      1
-   4 │     4      2
-```
-"""
-Base.unique!(df::AbstractDataFrame) = deleteat!(df, _findall(nonunique(df)))
-Base.unique!(df::AbstractDataFrame, cols::AbstractVector) =
-    deleteat!(df, _findall(nonunique(df, cols)))
-Base.unique!(df::AbstractDataFrame, cols) =
-    deleteat!(df, _findall(nonunique(df, cols)))
-
 """
     fillcombinations(df::AbstractDataFrame, indexcols;
                          allowduplicates::Bool=false,
@@ -1703,8 +1431,9 @@ function fillcombinations(df::AbstractDataFrame, indexcols;
                             "must be specified"))
     end
 
-    has_duplicates = row_group_slots(ntuple(i -> df[!, colind[i]], length(colind)),
-                                     Val(false), nothing, false, nothing)[1] != nrow(df)
+    # we use the hashing algorithm here because we assume that the tables we work with are not huge
+    has_duplicates = row_group_slots!(ntuple(i -> df[!, colind[i]], length(colind)),
+                                      Val(false), nothing, false, nothing, true)[1] != nrow(df)
     if has_duplicates && !allowduplicates
         throw(ArgumentError("duplicate combinations of `indexcols` are not " *
                             "allowed in input when `allowduplicates=false`"))