Skip to content

Commit

Permalink
add keep to nonunique, unique, and unique! (#3260)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Jan 28, 2023
1 parent 70d1e23 commit 4446a3d
Show file tree
Hide file tree
Showing 8 changed files with 566 additions and 365 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
* Joining functions now support `order` keyword argument allowing the user
to specify the order of the rows in the produced table
([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233))
* Add `keep` keyword argument to `nonunique`, `unique`, and `unique!`
allowing the user to specify which duplicate rows should be kept
([#3260](https://github.com/JuliaData/DataFrames.jl/pull/3260))

## Bug fixes

Expand Down
1 change: 1 addition & 0 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ include("other/utils.jl")
include("other/index.jl")

include("abstractdataframe/abstractdataframe.jl")
include("abstractdataframe/unique.jl")
include("dataframe/dataframe.jl")
include("subdataframe/subdataframe.jl")
include("dataframerow/dataframerow.jl")
Expand Down
277 changes: 3 additions & 274 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1369,278 +1369,6 @@ end
Base.Array(df::AbstractDataFrame) = Matrix(df)
Base.Array{T}(df::AbstractDataFrame) where {T} = Matrix{T}(df)

"""
nonunique(df::AbstractDataFrame)
nonunique(df::AbstractDataFrame, cols)
Return a `Vector{Bool}` in which `true` entries indicate duplicate rows.
A row is a duplicate if there exists a prior row with all columns containing
equal values (according to `isequal`).
See also [`unique`](@ref) and [`unique!`](@ref).
# Arguments
- `df` : `AbstractDataFrame`
- `cols` : a selector specifying the column(s) or their transformations to compare.
Can be any column selector or transformation accepted by [`select`](@ref) that
returns at least one column if `df` has at least one column.
# Examples
```jldoctest
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
julia> df = vcat(df, df)
8×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
5 │ 1 1
6 │ 2 2
7 │ 3 1
8 │ 4 2
julia> nonunique(df)
8-element Vector{Bool}:
0
0
0
0
1
1
1
1
julia> nonunique(df, 2)
8-element Vector{Bool}:
0
0
1
1
1
1
1
1
```
"""
function nonunique(df::AbstractDataFrame)
    # A data frame without columns has no rows to compare.
    if ncol(df) == 0
        return Bool[]
    end

    # Group rows using every column of `df`; only the third element of the
    # result (the group slots) is needed here.
    keycols = ntuple(i -> df[!, i], ncol(df))
    slots = row_group_slots(keycols, Val(true), nothing, false, nothing)[3]

    # Start by flagging every row as a duplicate, then clear the flag for each
    # row that is the first encountered representative of its group.
    isdup = fill(true, nrow(df))
    @inbounds for rowidx in slots
        if rowidx > 0
            isdup[rowidx] = false
        end
    end
    return isdup
end

function nonunique(df::AbstractDataFrame, cols)
    # Resolve `cols` against `df` and compare rows of the selection only.
    selected = _try_select_no_copy(df, cols)

    # A non-empty data frame must yield at least one column to compare on.
    if ncol(df) > 0 && ncol(selected) == 0
        throw(ArgumentError("finding duplicate rows in data frame when " *
                            "`cols` selects no columns is not allowed"))
    end
    return nonunique(selected)
end

"""
allunique(df::AbstractDataFrame, cols=:)
Return `true` if no row of `df` is duplicated. Two rows are duplicate if
all their columns contain equal values (according to `isequal`).
See also [`unique`](@ref) and [`nonunique`](@ref).
# Arguments
- `df` : `AbstractDataFrame`
- `cols` : a selector specifying the column(s) or their transformations to compare.
Can be any column selector or transformation accepted by [`select`](@ref).
# Examples
```jldoctest
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
julia> allunique(df)
true
julia> allunique(df, :x)
false
julia> allunique(df, :i => ByRow(isodd))
false
```
"""
function Base.allunique(df::AbstractDataFrame, cols=:)
    udf = _try_select_no_copy(df, cols)

    # With no rows in the selection there is nothing that could be duplicated.
    if nrow(udf) == 0
        return true
    end

    keycols = ntuple(i -> udf[!, i], ncol(udf))
    # NOTE(review): the first element returned by `row_group_slots` is
    # presumably the number of distinct groups - confirm against its definition.
    ngroups = row_group_slots(keycols, Val(false), nothing, false, nothing)[1]
    return ngroups == nrow(df)
end

"""
unique(df::AbstractDataFrame; view::Bool=false)
unique(df::AbstractDataFrame, cols; view::Bool=false)
Return a data frame containing only the first occurrence of unique rows in `df`.
When `cols` is specified, the returned `DataFrame` contains complete rows,
retaining in each case the first occurrence of a given combination of values
in selected columns or their transformations. `cols` can be any column
selector or transformation accepted by [`select`](@ref).
If `view=false` a freshly allocated `DataFrame` is returned,
and if `view=true` then a `SubDataFrame` view into `df` is returned.
# Arguments
- `df` : the AbstractDataFrame
- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
specifying the column(s) to compare.
$METADATA_FIXED
See also: [`unique!`](@ref), [`nonunique`](@ref).
# Examples
```jldoctest
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
julia> df = vcat(df, df)
8×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
5 │ 1 1
6 │ 2 2
7 │ 3 1
8 │ 4 2
julia> unique(df) # doesn't modify df
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
julia> unique(df, 2)
2×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
```
"""
@inline function Base.unique(df::AbstractDataFrame; view::Bool=false)
    # Keep exactly the rows that `nonunique` does not flag as duplicates.
    keep = .!nonunique(df)
    if view
        # `view` the keyword shadows the function, hence the qualified call.
        return Base.view(df, keep, :)
    else
        return df[keep, :]
    end
end

@inline function Base.unique(df::AbstractDataFrame, cols; view::Bool=false)
    # Duplicates are detected on `cols` only, but complete rows are returned.
    keep = .!nonunique(df, cols)
    if view
        # `view` the keyword shadows the function, hence the qualified call.
        return Base.view(df, keep, :)
    else
        return df[keep, :]
    end
end

"""
unique!(df::AbstractDataFrame)
unique!(df::AbstractDataFrame, cols)
Update `df` in-place to contain only the first occurrence of unique rows in `df`.
When `cols` is specified, the returned `DataFrame` contains complete rows,
retaining in each case the first occurrence of a given combination of values
in selected columns or their transformations. `cols` can be any column
selector or transformation accepted by [`select`](@ref).
# Arguments
- `df` : the AbstractDataFrame
- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
specifying the column(s) to compare.
$METADATA_FIXED
See also: [`unique`](@ref), [`nonunique`](@ref).
# Examples
```jldoctest
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
julia> df = vcat(df, df)
8×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
5 │ 1 1
6 │ 2 2
7 │ 3 1
8 │ 4 2
julia> unique!(df) # modifies df
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
```
"""
function Base.unique!(df::AbstractDataFrame)
    # Drop, in place, every row flagged as a duplicate by `nonunique`.
    duplicated = nonunique(df)
    return deleteat!(df, _findall(duplicated))
end

# NOTE(review): this method has the same body as the generic `cols` method
# below; presumably it exists only to disambiguate dispatch for vector
# selectors - confirm before removing.
function Base.unique!(df::AbstractDataFrame, cols::AbstractVector)
    duplicated = nonunique(df, cols)
    return deleteat!(df, _findall(duplicated))
end

function Base.unique!(df::AbstractDataFrame, cols)
    # Duplicates are detected on `cols` only, but whole rows are removed.
    duplicated = nonunique(df, cols)
    return deleteat!(df, _findall(duplicated))
end

"""
fillcombinations(df::AbstractDataFrame, indexcols;
allowduplicates::Bool=false,
Expand Down Expand Up @@ -1703,8 +1431,9 @@ function fillcombinations(df::AbstractDataFrame, indexcols;
"must be specified"))
end

has_duplicates = row_group_slots(ntuple(i -> df[!, colind[i]], length(colind)),
Val(false), nothing, false, nothing)[1] != nrow(df)
# we use hashing algorithm here, because we assume that the tables we work with are not huge
has_duplicates = row_group_slots!(ntuple(i -> df[!, colind[i]], length(colind)),
Val(false), nothing, false, nothing, true)[1] != nrow(df)
if has_duplicates && !allowduplicates
throw(ArgumentError("duplicate combinations of `indexcols` are not " *
"allowed in input when `allowduplicates=false`"))
Expand Down
Loading

0 comments on commit 4446a3d

Please sign in to comment.