Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add keep to nonunique, unique, and unique! #3260

Merged
merged 12 commits into from
Jan 28, 2023
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
* Joining functions now support `order` keyword argument allowing the user
to specify the order of the rows in the produced table
([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233))
* Add `keep` keyword argument to `nonunique`, `unique`, and `unique!`
allowing the user to specify which duplicate rows should be kept
([#3260](https://github.com/JuliaData/DataFrames.jl/pull/3260))

## Bug fixes

Expand Down
1 change: 1 addition & 0 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ include("other/utils.jl")
include("other/index.jl")

include("abstractdataframe/abstractdataframe.jl")
include("abstractdataframe/unique.jl")
include("dataframe/dataframe.jl")
include("subdataframe/subdataframe.jl")
include("dataframerow/dataframerow.jl")
Expand Down
278 changes: 3 additions & 275 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1342,278 +1342,6 @@ end
function Base.Array(df::AbstractDataFrame)
    # Conversion to `Array` delegates to the `Matrix` constructor.
    return Matrix(df)
end

function Base.Array{T}(df::AbstractDataFrame) where {T}
    # Same delegation, forwarding the requested element type `T`.
    return Matrix{T}(df)
end

"""
nonunique(df::AbstractDataFrame)
nonunique(df::AbstractDataFrame, cols)

Return a `Vector{Bool}` in which `true` entries indicate duplicate rows.
A row is a duplicate if there exists a prior row with all columns containing
equal values (according to `isequal`).

See also [`unique`](@ref) and [`unique!`](@ref).

# Arguments
- `df` : `AbstractDataFrame`
- `cols` : a selector specifying the column(s) or their transformations to compare.
Can be any column selector or transformation accepted by [`select`](@ref) that
returns at least one column if `df` has at least one column.

# Examples

```jldoctest
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2

julia> df = vcat(df, df)
8×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
5 │ 1 1
6 │ 2 2
7 │ 3 1
8 │ 4 2

julia> nonunique(df)
8-element Vector{Bool}:
0
0
0
0
1
1
1
1

julia> nonunique(df, 2)
8-element Vector{Bool}:
0
0
1
1
1
1
1
1
```
"""
function nonunique(df::AbstractDataFrame)
    # No columns: return an empty vector.
    if ncol(df) == 0
        return Bool[]
    end
    columns = ntuple(i -> df[!, i], ncol(df))
    slots = row_group_slots(columns, Val(true), nothing, false, nothing)[3]
    # Start with every row marked as a duplicate, then clear the mark for the
    # first encountered representative of each group; whatever stays `true`
    # is a nonunique row.
    isduplicate = fill(true, nrow(df))
    @inbounds for rowidx in slots
        if rowidx > 0
            isduplicate[rowidx] = false
        end
    end
    return isduplicate
end

function nonunique(df::AbstractDataFrame, cols)
    selected = _try_select_no_copy(df, cols)
    # Reject a selector that leaves no columns when `df` itself has columns.
    if ncol(df) > 0 && ncol(selected) == 0
        throw(ArgumentError("finding duplicate rows in data frame when " *
                            "`cols` selects no columns is not allowed"))
    end
    # Otherwise defer to the single-argument method on the selected columns.
    return nonunique(selected)
end

"""
allunique(df::AbstractDataFrame, cols=:)

Return `true` if all rows of `df` are not duplicated. Two rows are duplicate if
all their columns contain equal values (according to `isequal`).

See also [`unique`](@ref) and [`nonunique`](@ref).

# Arguments
- `df` : `AbstractDataFrame`
- `cols` : a selector specifying the column(s) or their transformations to compare.
Can be any column selector or transformation accepted by [`select`](@ref).

# Examples

```jldoctest
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2

julia> allunique(df)
true

julia> allunique(df, :x)
false

julia> allunique(df, :i => ByRow(isodd))
false
```
"""
function Base.allunique(df::AbstractDataFrame, cols=:)
    selected = _try_select_no_copy(df, cols)
    # An empty selection is reported as all-unique.
    if nrow(selected) == 0
        return true
    end
    columns = ntuple(i -> selected[!, i], ncol(selected))
    # NOTE(review): the first element of the result is presumably the number of
    # distinct row groups; all rows are unique when it equals the row count.
    ngroups = row_group_slots(columns, Val(false), nothing, false, nothing)[1]
    return ngroups == nrow(df)
end

"""
unique(df::AbstractDataFrame; view::Bool=false)
unique(df::AbstractDataFrame, cols; view::Bool=false)

Return a data frame containing only the first occurrence of unique rows in `df`.
When `cols` is specified, the returned `DataFrame` contains complete rows,
retaining in each case the first occurrence of a given combination of values
in selected columns or their transformations. `cols` can be any column
selector or transformation accepted by [`select`](@ref).

If `view=false` a freshly allocated `DataFrame` is returned,
and if `view=true` then a `SubDataFrame` view into `df` is returned.

# Arguments
- `df` : the AbstractDataFrame
- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
specifying the column(s) to compare.

$METADATA_FIXED

See also: [`unique!`](@ref), [`nonunique`](@ref).

# Examples

```jldoctest
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2

julia> df = vcat(df, df)
8×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
5 │ 1 1
6 │ 2 2
7 │ 3 1
8 │ 4 2

julia> unique(df) # doesn't modify df
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2

julia> unique(df, 2)
2×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
```
"""
@inline function Base.unique(df::AbstractDataFrame; view::Bool=false)
    # Rows to retain are exactly those not flagged as duplicates.
    keeprows = .!nonunique(df)
    if view
        return Base.view(df, keeprows, :)
    else
        return df[keeprows, :]
    end
end

@inline function Base.unique(df::AbstractDataFrame, cols; view::Bool=false)
    # Duplicates are detected on `cols` only, but complete rows of `df` are returned.
    keeprows = .!nonunique(df, cols)
    if view
        return Base.view(df, keeprows, :)
    else
        return df[keeprows, :]
    end
end

"""
unique!(df::AbstractDataFrame)
unique!(df::AbstractDataFrame, cols)

Update `df` in-place to contain only the first occurrence of unique rows in `df`.
When `cols` is specified, the returned `DataFrame` contains complete rows,
retaining in each case the first occurrence of a given combination of values
in selected columns or their transformations. `cols` can be any column
selector or transformation accepted by [`select`](@ref).

# Arguments
- `df` : the AbstractDataFrame
- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
specifying the column(s) to compare.

$METADATA_FIXED

See also: [`unique`](@ref), [`nonunique`](@ref).

# Examples

```jldoctest
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2

julia> df = vcat(df, df)
8×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
5 │ 1 1
6 │ 2 2
7 │ 3 1
8 │ 4 2

julia> unique!(df) # modifies df
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2
```
"""
function Base.unique!(df::AbstractDataFrame)
    # Delete, in place, every row flagged as a duplicate.
    duplicated = _findall(nonunique(df))
    return deleteat!(df, duplicated)
end

# NOTE(review): this method has the same body as the generic `cols` method below,
# with `cols` restricted to `AbstractVector`; presumably it exists to avoid a
# dispatch ambiguity with a `Base.unique!` method — confirm before merging the two.
function Base.unique!(df::AbstractDataFrame, cols::AbstractVector)
    duplicated = _findall(nonunique(df, cols))
    return deleteat!(df, duplicated)
end

function Base.unique!(df::AbstractDataFrame, cols)
    duplicated = _findall(nonunique(df, cols))
    return deleteat!(df, duplicated)
end

"""
fillcombinations(df::AbstractDataFrame, indexcols;
allowduplicates::Bool=false,
Expand Down Expand Up @@ -1676,8 +1404,9 @@ function fillcombinations(df::AbstractDataFrame, indexcols;
"must be specified"))
end

has_duplicates = row_group_slots(ntuple(i -> df[!, colind[i]], length(colind)),
Val(false), nothing, false, nothing)[1] != nrow(df)
# we use hashing algorithm here, because we assume that the tables we work with are not huge
has_duplicates = row_group_slots!(ntuple(i -> df[!, colind[i]], length(colind)),
Val(false), nothing, false, nothing, true)[1] != nrow(df)
if has_duplicates && !allowduplicates
throw(ArgumentError("duplicate combinations of `indexcols` are not " *
"allowed in input when `allowduplicates=false`"))
Expand Down Expand Up @@ -3402,4 +3131,3 @@ function Base.iterate(itr::Iterators.PartitionIterator{<:AbstractDataFrame}, sta
r = min(state + itr.n - 1, last_idx)
return view(itr.c, state:r, :), r + 1
end

Loading