From 84e1a0f0d0693d76d4c2e87bd5a9d86968d39098 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 31 Dec 2022 17:28:06 +0100 Subject: [PATCH 01/12] add keep to nonunique, unique, and unique! --- NEWS.md | 3 + src/DataFrames.jl | 1 + src/abstractdataframe/abstractdataframe.jl | 276 +--------------- src/abstractdataframe/unique.jl | 349 +++++++++++++++++++++ src/groupeddataframe/groupeddataframe.jl | 6 +- src/groupeddataframe/utils.jl | 55 ++-- test/data.jl | 56 ---- test/duplicates.jl | 121 ++++++- 8 files changed, 506 insertions(+), 361 deletions(-) create mode 100644 src/abstractdataframe/unique.jl diff --git a/NEWS.md b/NEWS.md index da12048624..39aee15a8f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,9 @@ * Joining functions now support `order` keyword argument allowing the user to specify the order of the rows in the produced table ([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233)) +* Add `keep` keyword argument to `nonunique`, `unique`, and `unique!` + allowing to specify which duplicate rows should be kept + ([#3260](https://github.com/JuliaData/DataFrames.jl/pull/3260)) ## Bug fixes diff --git a/src/DataFrames.jl b/src/DataFrames.jl index c5d8366214..a2a652154a 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -134,6 +134,7 @@ include("other/utils.jl") include("other/index.jl") include("abstractdataframe/abstractdataframe.jl") +include("abstractdataframe/unique.jl") include("dataframe/dataframe.jl") include("subdataframe/subdataframe.jl") include("dataframerow/dataframerow.jl") diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 9fba690d49..157cf4bf17 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -1342,278 +1342,6 @@ end Base.Array(df::AbstractDataFrame) = Matrix(df) Base.Array{T}(df::AbstractDataFrame) where {T} = Matrix{T}(df) -""" - nonunique(df::AbstractDataFrame) - 
nonunique(df::AbstractDataFrame, cols) - -Return a `Vector{Bool}` in which `true` entries indicate duplicate rows. -A row is a duplicate if there exists a prior row with all columns containing -equal values (according to `isequal`). - -See also [`unique`](@ref) and [`unique!`](@ref). - -# Arguments -- `df` : `AbstractDataFrame` -- `cols` : a selector specifying the column(s) or their transformations to compare. - Can be any column selector or transformation accepted by [`select`](@ref) that - returns at least one column if `df` has at least one column. - -# Examples - -```jldoctest -julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - -julia> df = vcat(df, df) -8×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - 5 │ 1 1 - 6 │ 2 2 - 7 │ 3 1 - 8 │ 4 2 - -julia> nonunique(df) -8-element Vector{Bool}: - 0 - 0 - 0 - 0 - 1 - 1 - 1 - 1 - -julia> nonunique(df, 2) -8-element Vector{Bool}: - 0 - 0 - 1 - 1 - 1 - 1 - 1 - 1 -``` -""" -function nonunique(df::AbstractDataFrame) - ncol(df) == 0 && return Bool[] - gslots = row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), nothing, false, nothing)[3] - # unique rows are the first encountered group representatives, - # nonunique are everything else - res = fill(true, nrow(df)) - @inbounds for g_row in gslots - (g_row > 0) && (res[g_row] = false) - end - return res -end - -function nonunique(df::AbstractDataFrame, cols) - udf = _try_select_no_copy(df, cols) - if ncol(df) > 0 && ncol(udf) == 0 - throw(ArgumentError("finding duplicate rows in data frame when " * - "`cols` selects no columns is not allowed")) - else - return nonunique(udf) - end -end - -""" - allunique(df::AbstractDataFrame, cols=:) - -Return `true` if all rows of `df` are not duplicated. Two rows are duplicate if -all their columns contain equal values (according to `isequal`). 
- -See also [`unique`](@ref) and [`nonunique`](@ref). - -# Arguments -- `df` : `AbstractDataFrame` -- `cols` : a selector specifying the column(s) or their transformations to compare. - Can be any column selector or transformation accepted by [`select`](@ref). - -# Examples - -```jldoctest -julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - -julia> allunique(df) -true - -julia> allunique(df, :x) -false - -julia> allunique(df, :i => ByRow(isodd)) -false -``` -""" -function Base.allunique(df::AbstractDataFrame, cols=:) - udf = _try_select_no_copy(df, cols) - nrow(udf) == 0 && return true - return row_group_slots(ntuple(i -> udf[!, i], ncol(udf)), - Val(false), nothing, false, nothing)[1] == nrow(df) -end - -""" - unique(df::AbstractDataFrame; view::Bool=false) - unique(df::AbstractDataFrame, cols; view::Bool=false) - -Return a data frame containing only the first occurrence of unique rows in `df`. -When `cols` is specified, the returned `DataFrame` contains complete rows, -retaining in each case the first occurrence of a given combination of values -in selected columns or their transformations. `cols` can be any column -selector or transformation accepted by [`select`](@ref). - -If `view=false` a freshly allocated `DataFrame` is returned, -and if `view=true` then a `SubDataFrame` view into `df` is returned. - -# Arguments -- `df` : the AbstractDataFrame -- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.) -specifying the column(s) to compare. - -$METADATA_FIXED - -See also: [`unique!`](@ref), [`nonunique`](@ref). 
- -# Examples - -```jldoctest -julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - -julia> df = vcat(df, df) -8×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - 5 │ 1 1 - 6 │ 2 2 - 7 │ 3 1 - 8 │ 4 2 - -julia> unique(df) # doesn't modify df -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - -julia> unique(df, 2) -2×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 -``` -""" -@inline function Base.unique(df::AbstractDataFrame; view::Bool=false) - rowidxs = (!).(nonunique(df)) - return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] -end - -@inline function Base.unique(df::AbstractDataFrame, cols; view::Bool=false) - rowidxs = (!).(nonunique(df, cols)) - return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] -end - -""" - unique!(df::AbstractDataFrame) - unique!(df::AbstractDataFrame, cols) - -Update `df` in-place to contain only the first occurrence of unique rows in `df`. -When `cols` is specified, the returned `DataFrame` contains complete rows, -retaining in each case the first occurrence of a given combination of values -in selected columns or their transformations. `cols` can be any column -selector or transformation accepted by [`select`](@ref). - -# Arguments -- `df` : the AbstractDataFrame -- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.) -specifying the column(s) to compare. - -$METADATA_FIXED - -See also: [`unique!`](@ref), [`nonunique`](@ref). 
- -# Examples - -```jldoctest -julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - -julia> df = vcat(df, df) -8×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - 5 │ 1 1 - 6 │ 2 2 - 7 │ 3 1 - 8 │ 4 2 - -julia> unique!(df) # modifies df -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 -``` -""" -Base.unique!(df::AbstractDataFrame) = deleteat!(df, _findall(nonunique(df))) -Base.unique!(df::AbstractDataFrame, cols::AbstractVector) = - deleteat!(df, _findall(nonunique(df, cols))) -Base.unique!(df::AbstractDataFrame, cols) = - deleteat!(df, _findall(nonunique(df, cols))) - """ fillcombinations(df::AbstractDataFrame, indexcols; allowduplicates::Bool=false, @@ -1676,8 +1404,8 @@ function fillcombinations(df::AbstractDataFrame, indexcols; "must be specified")) end - has_duplicates = row_group_slots(ntuple(i -> df[!, colind[i]], length(colind)), - Val(false), nothing, false, nothing)[1] != nrow(df) + has_duplicates = row_group_slots!(ntuple(i -> df[!, colind[i]], length(colind)), + Val(false), nothing, false, nothing)[1] != nrow(df) if has_duplicates && !allowduplicates throw(ArgumentError("duplicate combinations of `indexcols` are not " * "allowed in input when `allowduplicates=false`")) diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl new file mode 100644 index 0000000000..695e58570c --- /dev/null +++ b/src/abstractdataframe/unique.jl @@ -0,0 +1,349 @@ +""" + nonunique(df::AbstractDataFrame; keep::Symbol=:first) + nonunique(df::AbstractDataFrame, cols; keep::Symbol=:first) + +Return a `Vector{Bool}` in which `true` entries indicate duplicate rows. + +If `keep=:first` (the default) a row is a duplicate if there exists a prior +row with all columns containing equal values (according to `isequal`). 
+ +If `keep=:last` a row is a duplicate if there exists a subsequent row with all +columns containing equal values (according to `isequal`). + +If `keep=:only` a row is a duplicate if there exists any other row with all +columns containing equal values (according to `isequal`). + +See also [`unique`](@ref) and [`unique!`](@ref). + +# Arguments +- `df` : `AbstractDataFrame` +- `cols` : a selector specifying the column(s) or their transformations to + compare. Can be any column selector or transformation accepted by + [`select`](@ref) that returns at least one column if `df` has at least one + column. + +# Examples + +```jldoctest +julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> df = vcat(df, df) +8×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + 5 │ 1 1 + 6 │ 2 2 + 7 │ 3 1 + 8 │ 4 2 + +julia> nonunique(df) +8-element Vector{Bool}: + 0 + 0 + 0 + 0 + 1 + 1 + 1 + 1 + +julia> nonunique(df, keep=:last) +8-element Vector{Bool}: + 1 + 1 + 1 + 1 + 0 + 0 + 0 + 0 + +julia> nonunique(df, 2) +8-element Vector{Bool}: + 0 + 0 + 1 + 1 + 1 + 1 + 1 + 1 +``` +""" +function nonunique(df::AbstractDataFrame; keep::Symbol=:first) + if !(keep in (:first, :last, :only)) + throw(ArgumentError("`keep` must be :first, :last, or :none")) + end + ncol(df) == 0 && return Bool[] + res = fill(true, nrow(df)) + if keep == :first + gslots = row_group_slots!(ntuple(i -> df[!, i], ncol(df)), Val(false), + nothing, false, nothing)[3] + # unique rows are the first encountered group representatives, + # nonunique are everything else + @inbounds for g_row in gslots + (g_row > 0) && (res[g_row] = false) + end + return res + else + # TODO: this can be potentially optimized in the future, + # but the use of this code is expected to be rare + # so currently a simple implementation is provided + # that is already visibly faster than using 
groupby and combine + gdf = groupby(df, All()) + idx = gdf.idx + @assert length(gdf.starts) == length(gdf.ends) + if keep == :last + for (s, e) in zip(gdf.starts, gdf.ends) + # keep last index in a group + res[idx[e]] = false + end + else + @assert keep == :only + for (s, e) in zip(gdf.starts, gdf.ends) + # set to false if s == e + res[idx[e]] = s != e + end + end + end + return res +end + +function nonunique(df::AbstractDataFrame, cols; keep::Symbol=:first) + udf = _try_select_no_copy(df, cols) + if ncol(df) > 0 && ncol(udf) == 0 + throw(ArgumentError("finding duplicate rows in data frame when " * + "`cols` selects no columns is not allowed")) + else + return nonunique(udf, keep=keep) + end +end + +""" + allunique(df::AbstractDataFrame, cols=:) + +Return `true` if all rows of `df` are not duplicated. Two rows are duplicate if +all their columns contain equal values (according to `isequal`). + +See also [`unique`](@ref) and [`nonunique`](@ref). + +# Arguments +- `df` : `AbstractDataFrame` +- `cols` : a selector specifying the column(s) or their transformations to compare. + Can be any column selector or transformation accepted by [`select`](@ref). + +# Examples + +```jldoctest +julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> allunique(df) +true + +julia> allunique(df, :x) +false + +julia> allunique(df, :i => ByRow(isodd)) +false +``` +""" +function Base.allunique(df::AbstractDataFrame, cols=:) + udf = _try_select_no_copy(df, cols) + nrow(udf) == 0 && return true + return row_group_slots!(ntuple(i -> udf[!, i], ncol(udf)), + Val(false), nothing, false, nothing)[1] == nrow(df) +end + +""" + unique(df::AbstractDataFrame; view::Bool=false, keep::Symbol=:first) + unique(df::AbstractDataFrame, cols; view::Bool=false, keep::Symbol=:first) + +If `keep=:first` (the default) return a data frame containing only the first +occurrence of unique rows in `df`. 
+ +If `keep=:last` return a data frame containing only the last occurrence of +unique rows in `df`. + +If `keep=:only` return a data frame containing only rows that are unique in `df` +(in case of duplicate rows all are dropped). + +When `cols` is specified, the returned `DataFrame` contains complete rows, +retaining in each case the first occurrence of a given combination of values +in selected columns or their transformations. `cols` can be any column +selector or transformation accepted by [`select`](@ref). + +If `view=false` a freshly allocated `DataFrame` is returned, +and if `view=true` then a `SubDataFrame` view into `df` is returned. + +# Arguments +- `df` : the AbstractDataFrame +- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.) +specifying the column(s) to compare. + +$METADATA_FIXED + +See also: [`unique!`](@ref), [`nonunique`](@ref). + +# Examples + +```jldoctest +julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> df = vcat(df, df) +8×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + 5 │ 1 1 + 6 │ 2 2 + 7 │ 3 1 + 8 │ 4 2 + +julia> unique(df) # doesn't modify df +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> unique(df, 2) +2×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + +julia> unique(df, keep=:only) +0×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┴────────────── +``` +""" +@inline function Base.unique(df::AbstractDataFrame; view::Bool=false, + keep::Symbol=:first) + rowidxs = (!).(nonunique(df, keep=keep)) + return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] +end + +@inline function Base.unique(df::AbstractDataFrame, cols; view::Bool=false, + keep::Symbol=:first) + rowidxs = (!).(nonunique(df, cols, keep=keep)) + return view ? 
Base.view(df, rowidxs, :) : df[rowidxs, :] +end + +""" + unique!(df::AbstractDataFrame; keep::Symbol=:first) + unique!(df::AbstractDataFrame, cols; keep::Symbol=:first) + +If `keep=:first` (the default) update `df` in place to contain only the first +occurrence of unique rows in `df`. + +If `keep=:last` update `df` in place to contain only the last occurrence of +unique rows in `df`. + +If `keep=:only` update `df` in place to contain only rows that are unique in `df` +(in case of duplicate rows all are dropped). + +When `cols` is specified, the returned `DataFrame` contains complete rows, +retaining in each case the first occurrence of a given combination of values +in selected columns or their transformations. `cols` can be any column +selector or transformation accepted by [`select`](@ref). + +# Arguments +- `df` : the AbstractDataFrame +- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.) +specifying the column(s) to compare. + +$METADATA_FIXED + +See also: [`unique!`](@ref), [`nonunique`](@ref). 
+ +# Examples + +```jldoctest +julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> df = vcat(df, df) +8×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + 5 │ 1 1 + 6 │ 2 2 + 7 │ 3 1 + 8 │ 4 2 + +julia> unique!(df) # modifies df +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> unique(df, keep=:only) +0×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┴────────────── +``` +""" +Base.unique!(df::AbstractDataFrame; keep::Symbol=:first) = + deleteat!(df, _findall(nonunique(df, keep=keep))) +Base.unique!(df::AbstractDataFrame, cols::AbstractVector; keep::Symbol=:first) = + deleteat!(df, _findall(nonunique(df, cols, keep=keep))) +Base.unique!(df::AbstractDataFrame, cols; keep::Symbol=:first) = + deleteat!(df, _findall(nonunique(df, cols, keep=keep))) + diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index f6d4bf9c69..6417c1d68c 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -223,7 +223,7 @@ function groupby(df::AbstractDataFrame, cols; (cols isa AbstractVector && any(x -> x isa UserColOrdering, cols)) if isnothing(sort) || sort === true # if sort === true replace it with NamedTuple to avoid sorting - # in row_group_slots as we will perform sorting later + # in row_group_slots! as we will perform sorting later sort = NamedTuple() elseif sort === false throw(ArgumentError("passing `order` is only allowed if `sort` " * @@ -248,13 +248,13 @@ function groupby(df::AbstractDataFrame, cols; groups = Vector{Int}(undef, nrow(df)) ngroups, rhashes, gslots, sorted = - row_group_slots(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false), + row_group_slots!(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false), groups, skipmissing, sort isa NamedTuple ? 
nothing : sort) gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing, ngroups, nothing, Threads.ReentrantLock()) - # sort groups if row_group_slots hasn't already done that + # sort groups if row_group_slots! hasn't already done that if (sort === true && !sorted) || (sort isa NamedTuple) # Find index of representative row for each group idx = Vector{Int}(undef, length(gd)) diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index 3139f30339..a173c0f2f4 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -82,12 +82,12 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int, # IntegerRefarray and IntegerRefPool are two complementary view types that allow # wrapping arrays with Union{Real, Missing} eltype to satisfy the DataAPI.refpool -# and DataAPI.refarray API when calling row_group_slots. +# and DataAPI.refarray API when calling row_group_slots!. # IntegerRefarray converts values to Int and replaces missing with an integer # (set by the caller to the maximum value + 1) # IntegerRefPool subtracts the minimum value - 1 and replaces back the maximum # value + 1 to missing. This ensures all values are in 1:length(refpool), while -# row_group_slots knows the number of (potential) groups via length(refpool) +# row_group_slots! knows the number of (potential) groups via length(refpool) # and is able to skip missing values when skipmissing=true struct IntegerRefarray{T<:AbstractArray} <: AbstractVector{Int} @@ -157,7 +157,7 @@ function refpool_and_array(x::AbstractArray) minval, maxval = extrema(x) end ngroups = big(maxval) - big(minval) + 1 - # Threshold chosen with the same rationale as the row_group_slots refpool method: + # Threshold chosen with the same rationale as the row_group_slots! 
refpool method: # refpool approach is faster but we should not allocate too much memory either # We also have to avoid overflow, including with ngroups + 1 for missing values # (note that it would be possible to allow minval and maxval to be outside of the @@ -181,11 +181,12 @@ end # 4) whether groups are already sorted # Optional `groups` vector is set to the group indices of each row (starting at 1) # With skipmissing=true, rows with missing values are attributed index 0. -function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, - hash::Val, - groups::Union{Vector{Int}, Nothing}, - skipmissing::Bool, - sort::Union{Bool, Nothing})::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} +function row_group_slots!(cols::Tuple{Vararg{AbstractVector}}, + hash::Val, + groups::Union{Vector{Int}, Nothing}, + skipmissing::Bool, + sort::Union{Bool, Nothing} + )::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} rpa = refpool_and_array.(cols) if sort === false refpools = nothing @@ -194,17 +195,17 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, refpools = first.(rpa) refarrays = last.(rpa) end - row_group_slots(cols, refpools, refarrays, hash, groups, skipmissing, sort === true) + row_group_slots!(cols, refpools, refarrays, hash, groups, skipmissing, sort === true) end # Generic fallback method based on open addressing hash table -function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, - refpools::Any, # Ignored - refarrays::Any, # Ignored - hash::Val, - groups::Union{Vector{Int}, Nothing}, - skipmissing::Bool, - sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} +function row_group_slots!(cols::Tuple{Vararg{AbstractVector}}, + refpools::Any, # Ignored + refarrays::Any, # Ignored + hash::Val, + groups::Union{Vector{Int}, Nothing}, + skipmissing::Bool, + sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} @assert groups === nothing || length(groups) == length(cols[1]) rhashes, missings = hashrows(cols, skipmissing) # inspired by Dict code from base cf. 
https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481 @@ -251,16 +252,16 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, end # Optimized method for arrays for which DataAPI.refpool is defined and returns an AbstractVector -function row_group_slots(cols::NTuple{N, AbstractVector}, - refpools::NTuple{N, AbstractVector}, - refarrays::NTuple{N, - Union{AbstractVector{<:Real}, - Missings.EachReplaceMissing{ - <:AbstractVector{<:Union{Real, Missing}}}}}, - hash::Val{false}, - groups::Vector{Int}, - skipmissing::Bool, - sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N +function row_group_slots!(cols::NTuple{N, AbstractVector}, + refpools::NTuple{N, AbstractVector}, + refarrays::NTuple{N, + Union{AbstractVector{<:Real}, + Missings.EachReplaceMissing{ + <:AbstractVector{<:Union{Real, Missing}}}}}, + hash::Val{false}, + groups::Vector{Int}, + skipmissing::Bool, + sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N # Computing neither hashes nor groups isn't very useful, # and this method needs to allocate a groups vector anyway @assert all(col -> length(col) == length(groups), cols) @@ -296,7 +297,7 @@ function row_group_slots(cols::NTuple{N, AbstractVector}, newcols = (skipmissing && any(refpool -> eltype(refpool) >: Missing, refpools)) || !(refarrays isa NTuple{<:Any, AbstractVector}) || sort ? cols : refarrays - return invoke(row_group_slots, + return invoke(row_group_slots!, Tuple{Tuple{Vararg{AbstractVector}}, Any, Any, Val, Union{Vector{Int}, Nothing}, Bool, Bool}, newcols, refpools, refarrays, hash, groups, skipmissing, sort) diff --git a/test/data.jl b/test/data.jl index 3399ad35e7..b5348c5705 100644 --- a/test/data.jl +++ b/test/data.jl @@ -229,62 +229,6 @@ end @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true) end -@testset "nonunique, nonunique, unique! 
with extra argument" begin - df1 = DataFrame(a=Union{String, Missing}["a", "b", "a", "b", "a", "b"], - b=Vector{Union{Int, Missing}}(1:6), - c=Union{Int, Missing}[1:3;1:3]) - df = vcat(df1, df1) - @test findall(nonunique(df)) == collect(7:12) - @test findall(nonunique(df, :)) == collect(7:12) - @test findall(nonunique(df, Colon())) == collect(7:12) - @test findall(nonunique(df, :a)) == collect(3:12) - @test findall(nonunique(df, "a")) == collect(3:12) - @test findall(nonunique(df, [:a, :c])) == collect(7:12) - @test findall(nonunique(df, ["a", "c"])) == collect(7:12) - @test findall(nonunique(df, r"[ac]")) == collect(7:12) - @test findall(nonunique(df, Not(2))) == collect(7:12) - @test findall(nonunique(df, Not([2]))) == collect(7:12) - @test findall(nonunique(df, Not(:b))) == collect(7:12) - @test findall(nonunique(df, Not([:b]))) == collect(7:12) - @test findall(nonunique(df, Not([false, true, false]))) == collect(7:12) - @test findall(nonunique(df, [1, 3])) == collect(7:12) - @test findall(nonunique(df, 1)) == collect(3:12) - @test findall(nonunique(df, :a => x -> 1)) == 2:12 - - @test unique(df) == df1 - @test unique(df, :) == df1 - @test unique(df, Colon()) == df1 - @test unique(df, 2:3) == df1 - @test unique(df, 3) == df1[1:3, :] - @test unique(df, [1, 3]) == df1 - @test unique(df, [:a, :c]) == df1 - @test unique(df, ["a", "c"]) == df1 - @test unique(df, r"[ac]") == df1 - @test unique(df, Not(2)) == df1 - @test unique(df, Not([2])) == df1 - @test unique(df, Not(:b)) == df1 - @test unique(df, Not([:b])) == df1 - @test unique(df, Not([false, true, false])) == df1 - @test unique(df, :a) == df1[1:2, :] - @test unique(df, "a") == df1[1:2, :] - @test unique(df, :a => x -> 1) == df[1:1, :] - @test unique(DataFrame()) == DataFrame() - @test isempty(nonunique(DataFrame())) && nonunique(DataFrame()) isa Vector{Bool} - @test_throws ArgumentError nonunique(DataFrame(a=1:3), []) - @test_throws ArgumentError unique(DataFrame(a=1:3), []) - - @test unique(copy(df1), "a") == 
unique(copy(df1), :a) == unique(copy(df1), 1) == - df1[1:2, :] - - unique!(df, [1, 3]) - @test df == df1 - for cols in (r"[ac]", Not(:b), Not(2), Not([:b]), Not([2]), Not([false, true, false])) - df = vcat(df1, df1) - unique!(df, cols) - @test df == df1 - end -end - @testset "filter() and filter!()" begin df = DataFrame(x=[3, 1, 2, 1], y=["b", "c", "a", "b"]) @test filter(r -> r[:x] > 1, df) == DataFrame(x=[3, 2], y=["b", "a"]) diff --git a/test/duplicates.jl b/test/duplicates.jl index ec85020c02..e562de54b9 100644 --- a/test/duplicates.jl +++ b/test/duplicates.jl @@ -1,6 +1,6 @@ module TestDuplicates -using Test, DataFrames, CategoricalArrays +using Test, DataFrames, CategoricalArrays, Random const ≅ = isequal @testset "nonunique" begin @@ -41,4 +41,123 @@ const ≅ = isequal @test_throws ArgumentError unique(pdf, true) end +@testset "nonunique, nonunique, unique! with extra argument" begin + df1 = DataFrame(a=Union{String, Missing}["a", "b", "a", "b", "a", "b"], + b=Vector{Union{Int, Missing}}(1:6), + c=Union{Int, Missing}[1:3;1:3]) + df = vcat(df1, df1) + @test findall(nonunique(df)) == collect(7:12) + @test findall(nonunique(df, :)) == collect(7:12) + @test findall(nonunique(df, Colon())) == collect(7:12) + @test findall(nonunique(df, :a)) == collect(3:12) + @test findall(nonunique(df, "a")) == collect(3:12) + @test findall(nonunique(df, [:a, :c])) == collect(7:12) + @test findall(nonunique(df, ["a", "c"])) == collect(7:12) + @test findall(nonunique(df, r"[ac]")) == collect(7:12) + @test findall(nonunique(df, Not(2))) == collect(7:12) + @test findall(nonunique(df, Not([2]))) == collect(7:12) + @test findall(nonunique(df, Not(:b))) == collect(7:12) + @test findall(nonunique(df, Not([:b]))) == collect(7:12) + @test findall(nonunique(df, Not([false, true, false]))) == collect(7:12) + @test findall(nonunique(df, [1, 3])) == collect(7:12) + @test findall(nonunique(df, 1)) == collect(3:12) + @test findall(nonunique(df, :a => x -> 1)) == 2:12 + + @test unique(df) == df1 
+ @test unique(df, :) == df1 + @test unique(df, Colon()) == df1 + @test unique(df, 2:3) == df1 + @test unique(df, 3) == df1[1:3, :] + @test unique(df, [1, 3]) == df1 + @test unique(df, [:a, :c]) == df1 + @test unique(df, ["a", "c"]) == df1 + @test unique(df, r"[ac]") == df1 + @test unique(df, Not(2)) == df1 + @test unique(df, Not([2])) == df1 + @test unique(df, Not(:b)) == df1 + @test unique(df, Not([:b])) == df1 + @test unique(df, Not([false, true, false])) == df1 + @test unique(df, :a) == df1[1:2, :] + @test unique(df, "a") == df1[1:2, :] + @test unique(df, :a => x -> 1) == df[1:1, :] + @test unique(DataFrame()) == DataFrame() + @test isempty(nonunique(DataFrame())) && nonunique(DataFrame()) isa Vector{Bool} + @test_throws ArgumentError nonunique(DataFrame(a=1:3), []) + @test_throws ArgumentError unique(DataFrame(a=1:3), []) + + @test unique(copy(df1), "a") == unique(copy(df1), :a) == unique(copy(df1), 1) == + df1[1:2, :] + + unique!(df, [1, 3]) + @test df == df1 + for cols in (r"[ac]", Not(:b), Not(2), Not([:b]), Not([2]), Not([false, true, false])) + df = vcat(df1, df1) + unique!(df, cols) + @test df == df1 + end +end + +@testset "keep argument to nonunique/unique/unique!" 
begin + df = DataFrame(a=[1, 2, 3, 1, 2, 1], + b=["a", "b", "c", "a", "b", "a"], + c=categorical(["a", "b", "c", "a", "b", "a"])) + for cols in (1, 2, 3, [1, 2], [1, 3], [2, 3], [1, 2, 3]) + @test nonunique(df, cols, keep=:first) == + [false, false, false, true, true, true] + @test nonunique(df, cols, keep=:last) == + [true, true, false, true, false, false] + @test nonunique(df, cols, keep=:only) == + [true, true, false, true, true, true] + @test nonunique(select(df, cols), keep=:first) == + [false, false, false, true, true, true] + @test nonunique(select(df, cols), keep=:last) == + [true, true, false, true, false, false] + @test nonunique(select(df, cols), keep=:only) == + [true, true, false, true, true, true] + + @test unique(df, cols, keep=:first) == + df[.![false, false, false, true, true, true], :] + @test unique(df, cols, keep=:last) == + df[.![true, true, false, true, false, false], :] + @test unique(df, cols, keep=:only) == + df[.![true, true, false, true, true, true], :] + @test unique(select(df, cols), keep=:first) == + df[.![false, false, false, true, true, true], Cols(cols)] + @test unique(select(df, cols), keep=:last) == + df[.![true, true, false, true, false, false], Cols(cols)] + @test unique(select(df, cols), keep=:only) == + df[.![true, true, false, true, true, true], Cols(cols)] + + @test unique!(copy(df), cols, keep=:first) == + df[.![false, false, false, true, true, true], :] + @test unique!(copy(df), cols, keep=:last) == + df[.![true, true, false, true, false, false], :] + @test unique!(copy(df), cols, keep=:only) == + df[.![true, true, false, true, true, true], :] + @test unique!(select(df, cols), keep=:first) == + df[.![false, false, false, true, true, true], Cols(cols)] + @test unique!(select(df, cols), keep=:last) == + df[.![true, true, false, true, false, false], Cols(cols)] + @test unique!(select(df, cols), keep=:only) == + df[.![true, true, false, true, true, true], Cols(cols)] + end + + # some larger randomized test + Random.seed!(1234) 
+ df = DataFrame(a=rand(1:10^5, 10^5)) + df.b = string.(df.a) + df.c = categorical(df.b) + df.id = 1:10^5 + + for cols in (1, 2, 3, [1, 2], [1, 3], [2, 3], [1, 2, 3]) + @test select(unique(df, cols, keep=:first), cols, Not(cols)) == + combine(groupby(df, cols, sort=false), first) + @test select(unique(df, cols, keep=:last), cols, Not(cols)) == + sort(combine(groupby(df, cols, sort=false), last), :id) + @test select(unique(df, cols, keep=:only), cols, Not(cols)) == + sort(combine(groupby(df, cols, sort=false), + sdf -> nrow(sdf) == 1 ? sdf : NamedTuple()), :id) + end +end + end # module From fde2f2183e946fe736250e2169fdb003d2076ddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 31 Dec 2022 19:53:05 +0100 Subject: [PATCH 02/12] improve tests and fix docs --- src/abstractdataframe/unique.jl | 2 +- test/duplicates.jl | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index 695e58570c..aba2c2ba2b 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -323,7 +323,7 @@ julia> df = vcat(df, df) 7 │ 3 1 8 │ 4 2 -julia> unique!(df) # modifies df +julia> unique!(copy(df)) # modifies df 4×2 DataFrame Row │ i x │ Int64 Int64 diff --git a/test/duplicates.jl b/test/duplicates.jl index e562de54b9..fa3b246069 100644 --- a/test/duplicates.jl +++ b/test/duplicates.jl @@ -158,6 +158,13 @@ end sort(combine(groupby(df, cols, sort=false), sdf -> nrow(sdf) == 1 ? 
sdf : NamedTuple()), :id) end + + @test isempty(nonunique(DataFrame(), keep=:first)) + @test unique(DataFrame(a=[]), keep=:last) == DataFrame(a=[]) + @test unique!(DataFrame(), keep=:only) == DataFrame() + @test_throws ArgumentError nonunique(DataFrame(), keep=:a) + @test_throws ArgumentError unique(DataFrame(), keep=:b) + @test_throws ArgumentError unique!(DataFrame(), keep=:c) end end # module From b6662c1a0465a3efc8a5a25f72ec667427399914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 1 Jan 2023 18:56:45 +0100 Subject: [PATCH 03/12] improve performance --- src/abstractdataframe/unique.jl | 83 +++++++++++++++++++++++++-------- src/groupeddataframe/utils.jl | 1 + 2 files changed, 64 insertions(+), 20 deletions(-) diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index aba2c2ba2b..45836ca51c 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -89,33 +89,76 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) end ncol(df) == 0 && return Bool[] res = fill(true, nrow(df)) + cols = ntuple(i -> df[!, i], ncol(df)) if keep == :first - gslots = row_group_slots!(ntuple(i -> df[!, i], ncol(df)), Val(false), - nothing, false, nothing)[3] - # unique rows are the first encountered group representatives, - # nonunique are everything else - @inbounds for g_row in gslots - (g_row > 0) && (res[g_row] = false) + # if we can take advantage of references pass groups to avoid generating hashes + rpa = refpool_and_array.(cols) + refpools = first.(rpa) + refarrays = last.(rpa) + if isnothing(refpools) || isnothing(refarrays) + ngroups, _, gslots, _ = row_group_slots!(cols, Val(true), nothing, + false, nothing) + # unique rows are the first encountered group representatives, + # nonunique are everything else + cseen = 0 + @inbounds for g_row in gslots + if g_row > 0 + res[g_row] = false + # this check slows down the process when all rows are unique + # but speeds up when we have 
duplicates + cseen += 1 + cseen == ngroups && break + end + end + else + groups = Vector{Int}(undef, nrow(df)) + ngroups = row_group_slots!(cols, refpools, refarrays, + Val(false), groups, false, false)[1] + seen = fill(false, ngroups) + cseen = 0 + for i in 1:nrow(df) + g = groups[i] + if !seen[g] + seen[g] = true + res[i] = false + cseen += 1 + cseen == ngroups && break + end + end end - return res else - # TODO: this can be potentially optimized in the future, - # but the use of this code is expected to be rare - # so currently a simple implementation is provided - # that is already visibly faster than using groupby and combine - gdf = groupby(df, All()) - idx = gdf.idx - @assert length(gdf.starts) == length(gdf.ends) + groups = Vector{Int}(undef, nrow(df)) + ngroups = row_group_slots!(cols, Val(false), groups, false, nothing)[1] if keep == :last - for (s, e) in zip(gdf.starts, gdf.ends) - # keep last index in a group - res[idx[e]] = false + seen = fill(false, ngroups) + cseen = 0 + for i in nrow(df):-1:1 + g = groups[i] + if !seen[g] + seen[g] = true + res[i] = false + cseen += 1 + cseen == ngroups && break + end end else @assert keep == :only - for (s, e) in zip(gdf.starts, gdf.ends) - # set to false if s == e - res[idx[e]] = s != e + # -1 indicates that we have not seen the group yet + # positive value indicates the first position we have seen the group + # 0 indicates that we have seen the group at least twice + firstseen = fill(-1, ngroups) + for i in 1:nrow(df) + g = groups[i] + j = firstseen[g] + if j == -1 + # this is possibly non duplicate row + firstseen[g] = i + res[i] = false + elseif j > 0 + # the row had duplicate + res[j] = true + firstseen[g] = 0 + end end end end diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index a173c0f2f4..ae7a8e4013 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -178,6 +178,7 @@ end # 2) vector of row hashes (may be empty if hash=Val(false)) # 3) slot array for 
a hash map, non-zero values are # the indices of the first row in a group +# (returned only if hashes are generated) # 4) whether groups are already sorted # Optional `groups` vector is set to the group indices of each row (starting at 1) # With skipmissing=true, rows with missing values are attributed index 0. From 7409ba024183b33a5fcbf731a650d824b59b0393 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 1 Jan 2023 22:14:56 +0100 Subject: [PATCH 04/12] fix condition --- src/abstractdataframe/unique.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index 45836ca51c..7cc9ca45f9 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -95,7 +95,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) rpa = refpool_and_array.(cols) refpools = first.(rpa) refarrays = last.(rpa) - if isnothing(refpools) || isnothing(refarrays) + if any(isnothing, refpools) || any(isnothing, refarrays) ngroups, _, gslots, _ = row_group_slots!(cols, Val(true), nothing, false, nothing) # unique rows are the first encountered group representatives, From e6a9f8e01983539a55cb8d51f4153b48c7a29099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 9 Jan 2023 09:51:26 +0100 Subject: [PATCH 05/12] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/unique.jl | 36 +++++++++++++++------------------ 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index 7cc9ca45f9..1b4bb9b9eb 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -151,11 +151,11 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) g = groups[i] j = firstseen[g] if j == -1 - # this is possibly non duplicate row + # this is possibly a non duplicate row firstseen[g] = i res[i] = 
false elseif j > 0 - # the row had duplicate + # the row had a duplicate res[j] = true firstseen[g] = 0 end @@ -170,16 +170,16 @@ function nonunique(df::AbstractDataFrame, cols; keep::Symbol=:first) if ncol(df) > 0 && ncol(udf) == 0 throw(ArgumentError("finding duplicate rows in data frame when " * "`cols` selects no columns is not allowed")) - else - return nonunique(udf, keep=keep) end + return nonunique(udf, keep=keep) end """ allunique(df::AbstractDataFrame, cols=:) -Return `true` if all rows of `df` are not duplicated. Two rows are duplicate if -all their columns contain equal values (according to `isequal`). +Return `true` if none of the rows of `df` are duplicated. Two rows are duplicates if +all their columns contain equal values (according to `isequal`) +for all columns in `cols` (by default, all columns). See also [`unique`](@ref) and [`nonunique`](@ref). @@ -222,27 +222,23 @@ end unique(df::AbstractDataFrame; view::Bool=false, keep::Symbol=:first) unique(df::AbstractDataFrame, cols; view::Bool=false, keep::Symbol=:first) -If `keep=:first` (the default) return a data frame containing only the first -occurrence of unique rows in `df`. - -If `keep=:last` return a data frame containing only the last occurrence of -unique rows in `df`. - -If `keep=:only` return a data frame containing only rows that are unique in `df` -(in case of duplicate rows all are dropped). +Return a data frame containing only unique rows in `df`. -When `cols` is specified, the returned `DataFrame` contains complete rows, -retaining in each case the first occurrence of a given combination of values -in selected columns or their transformations. `cols` can be any column -selector or transformation accepted by [`select`](@ref). +Non-unique (duplicate) rows are those for which at least another row contains equal values +(according to `isequal`) for all columns in `cols` (by default, all columns). +If `keep=:first` (the default), only the first occurrence of a set of duplicate rows is kept. 
+If `keep=:last`, only the last occurrence of a set of duplicate rows is kept. +If `keep=:only`, only rows without any duplicates are kept. If `view=false` a freshly allocated `DataFrame` is returned, and if `view=true` then a `SubDataFrame` view into `df` is returned. # Arguments - `df` : the AbstractDataFrame -- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.) -specifying the column(s) to compare. +- `cols` : a selector specifying the column(s) or their transformations to + compare. Can be any column selector or transformation accepted by + [`select`](@ref) that returns at least one column if `df` has at least one + column. $METADATA_FIXED From 570a80bc926f9ad16bfafe5fc00291ad64d6fe6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 9 Jan 2023 10:06:46 +0100 Subject: [PATCH 06/12] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/unique.jl | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index 1b4bb9b9eb..9ff7cab4f7 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -4,14 +4,13 @@ Return a `Vector{Bool}` in which `true` entries indicate duplicate rows. -If `keep=:first` (the default) a row is a duplicate if there exists a prior -row with all columns containing equal values (according to `isequal`). - -If `keep=:last` a row is a duplicate if there exists a subsequent row with all -columns containing equal values (according to `isequal`). - -If `keep=:only` a row is a duplicate if there exists any other row with all -columns containing equal values (according to `isequal`). +Duplicate rows are those for which at least another row contains equal values +(according to `isequal`) for all columns in `cols` (by default, all columns). 
+If `keep=:first` (the default), only the first occurrence of a set of duplicate rows +is indicated with a `false` entry. +If `keep=:last`, only the last occurrence of a set of duplicate rows +is indicated with a `false` entry. +If `keep=:only`, only rows without any duplicates are indicated with a `false` entry. See also [`unique`](@ref) and [`unique!`](@ref). From 0484a135edde8c4699629c7723927e1958c7f08b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 9 Jan 2023 12:22:53 +0100 Subject: [PATCH 07/12] changes after code review --- src/abstractdataframe/abstractdataframe.jl | 3 +- src/abstractdataframe/unique.jl | 94 ++++++++++------------ src/groupeddataframe/groupeddataframe.jl | 3 +- src/groupeddataframe/utils.jl | 25 ++++-- test/duplicates.jl | 24 ++++-- 5 files changed, 80 insertions(+), 69 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 157cf4bf17..29262019f3 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -1404,8 +1404,9 @@ function fillcombinations(df::AbstractDataFrame, indexcols; "must be specified")) end + # we use hashing algorithm here, because we assume that the tables we work with are not huge has_duplicates = row_group_slots!(ntuple(i -> df[!, colind[i]], length(colind)), - Val(false), nothing, false, nothing)[1] != nrow(df) + Val(false), nothing, false, nothing, false)[1] != nrow(df) if has_duplicates && !allowduplicates throw(ArgumentError("duplicate combinations of `indexcols` are not " * "allowed in input when `allowduplicates=false`")) diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index 9ff7cab4f7..705af10a04 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -6,11 +6,12 @@ Return a `Vector{Bool}` in which `true` entries indicate duplicate rows. 
Duplicate rows are those for which at least another row contains equal values (according to `isequal`) for all columns in `cols` (by default, all columns). -If `keep=:first` (the default), only the first occurrence of a set of duplicate rows -is indicated with a `false` entry. -If `keep=:last`, only the last occurrence of a set of duplicate rows -is indicated with a `false` entry. -If `keep=:only`, only rows without any duplicates are indicated with a `false` entry. +If `keep=:first` (the default), only the first occurrence of a set of duplicate +rows is indicated with a `false` entry. +If `keep=:last`, only the last occurrence of a set of duplicate rows is +indicated with a `false` entry. +If `keep=:nonduplicates`, only rows without any duplicates are indicated with a +`false` entry. See also [`unique`](@ref) and [`unique!`](@ref). @@ -83,8 +84,8 @@ julia> nonunique(df, 2) ``` """ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) - if !(keep in (:first, :last, :only)) - throw(ArgumentError("`keep` must be :first, :last, or :none")) + if !(keep in (:first, :last, :nonduplicates)) + throw(ArgumentError("`keep` must be :first, :last, or :nonduplicates")) end ncol(df) == 0 && return Bool[] res = fill(true, nrow(df)) @@ -95,53 +96,40 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) refpools = first.(rpa) refarrays = last.(rpa) if any(isnothing, refpools) || any(isnothing, refarrays) - ngroups, _, gslots, _ = row_group_slots!(cols, Val(true), nothing, - false, nothing) + _, _, gslots, _ = row_group_slots!(cols, Val(true), nothing, + false, nothing, true) # unique rows are the first encountered group representatives, # nonunique are everything else - cseen = 0 @inbounds for g_row in gslots - if g_row > 0 - res[g_row] = false - # this check slows down the process when all rows are unique - # but speeds up when we have duplicates - cseen += 1 - cseen == ngroups && break - end + g_row > 0 && (res[g_row] = false) end else groups = 
Vector{Int}(undef, nrow(df)) ngroups = row_group_slots!(cols, refpools, refarrays, - Val(false), groups, false, false)[1] + Val(false), groups, false, false, true)[1] seen = fill(false, ngroups) - cseen = 0 for i in 1:nrow(df) g = groups[i] if !seen[g] seen[g] = true res[i] = false - cseen += 1 - cseen == ngroups && break end end end else groups = Vector{Int}(undef, nrow(df)) - ngroups = row_group_slots!(cols, Val(false), groups, false, nothing)[1] + ngroups = row_group_slots!(cols, Val(false), groups, false, nothing, true)[1] if keep == :last seen = fill(false, ngroups) - cseen = 0 for i in nrow(df):-1:1 g = groups[i] if !seen[g] seen[g] = true res[i] = false - cseen += 1 - cseen == ngroups && break end end else - @assert keep == :only + @assert keep == :nonduplicates # -1 indicates that we have not seen the group yet # positive value indicates the first position we have seen the group # 0 indicates that we have seen the group at least twice @@ -176,16 +164,17 @@ end """ allunique(df::AbstractDataFrame, cols=:) -Return `true` if none of the rows of `df` are duplicated. Two rows are duplicates if -all their columns contain equal values (according to `isequal`) +Return `true` if none of the rows of `df` are duplicated. Two rows are +duplicates if all their columns contain equal values (according to `isequal`) for all columns in `cols` (by default, all columns). See also [`unique`](@ref) and [`nonunique`](@ref). # Arguments - `df` : `AbstractDataFrame` -- `cols` : a selector specifying the column(s) or their transformations to compare. - Can be any column selector or transformation accepted by [`select`](@ref). +- `cols` : a selector specifying the column(s) or their transformations to + compare. Can be any column selector or transformation accepted by + [`select`](@ref). 
# Examples @@ -214,7 +203,7 @@ function Base.allunique(df::AbstractDataFrame, cols=:) udf = _try_select_no_copy(df, cols) nrow(udf) == 0 && return true return row_group_slots!(ntuple(i -> udf[!, i], ncol(udf)), - Val(false), nothing, false, nothing)[1] == nrow(df) + Val(false), nothing, false, nothing, false)[1] == nrow(df) end """ @@ -223,14 +212,16 @@ end Return a data frame containing only unique rows in `df`. -Non-unique (duplicate) rows are those for which at least another row contains equal values -(according to `isequal`) for all columns in `cols` (by default, all columns). -If `keep=:first` (the default), only the first occurrence of a set of duplicate rows is kept. +Non-unique (duplicate) rows are those for which at least another row contains +equal values (according to `isequal`) for all columns in `cols` (by default, +all columns). +If `keep=:first` (the default), only the first occurrence of a set of duplicate +rows is kept. If `keep=:last`, only the last occurrence of a set of duplicate rows is kept. -If `keep=:only`, only rows without any duplicates are kept. +If `keep=:nonduplicates`, only rows without any duplicates are kept. -If `view=false` a freshly allocated `DataFrame` is returned, -and if `view=true` then a `SubDataFrame` view into `df` is returned. +If `view=false` a freshly allocated `DataFrame` is returned, and if `view=true` +then a `SubDataFrame` view into `df` is returned. # Arguments - `df` : the AbstractDataFrame @@ -288,7 +279,7 @@ julia> unique(df, 2) 1 │ 1 1 2 │ 2 2 -julia> unique(df, keep=:only) +julia> unique(df, keep=:nonduplicates) 0×2 DataFrame Row │ i x │ Int64 Int64 @@ -311,24 +302,23 @@ end unique!(df::AbstractDataFrame; keep::Symbol=:first) unique!(df::AbstractDataFrame, cols; keep::Symbol=:first) -If `keep=:first` (the default) update `df` in place to contain only the first -occurrence of unique rows in `df`. +Update `df` in-place to contain only unique rows. 
-If `keep=:last` update `df` in place to contain only the last occurrence of -unique rows in `df`. - -If `keep=:only` update `df` in place to contain only rows that are unique in `df` -(in case of duplicate rows all are dropped). - -When `cols` is specified, the returned `DataFrame` contains complete rows, -retaining in each case the first occurrence of a given combination of values -in selected columns or their transformations. `cols` can be any column -selector or transformation accepted by [`select`](@ref). +Non-unique (duplicate) rows are those for which at least another row contains +equal values (according to `isequal`) for all columns in `cols` (by default, +all columns). +If `keep=:first` (the default), only the first occurrence of a set of duplicate +rows is kept. +If `keep=:last`, only the last occurrence of a set of duplicate rows is kept. +If `keep=:nonduplicates`, only rows without any duplicates are kept. # Arguments - `df` : the AbstractDataFrame - `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.) -specifying the column(s) to compare. + specifying the column(s) to compare. Can be any column selector or + transformation accepted by [`select`](@ref) that returns at least one column + if `df` has at least one column. + $METADATA_FIXED @@ -371,7 +361,7 @@ julia> unique!(copy(df)) # modifies df 3 │ 3 1 4 │ 4 2 -julia> unique(df, keep=:only) +julia> unique(df, keep=:nonduplicates) 0×2 DataFrame Row │ i x │ Int64 Int64 diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index 6417c1d68c..b0bc59046e 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -249,7 +249,8 @@ function groupby(df::AbstractDataFrame, cols; groups = Vector{Int}(undef, nrow(df)) ngroups, rhashes, gslots, sorted = row_group_slots!(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false), - groups, skipmissing, sort isa NamedTuple ? 
nothing : sort) + groups, skipmissing, + sort isa NamedTuple ? nothing : sort, false) gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing, ngroups, nothing, Threads.ReentrantLock()) diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index ae7a8e4013..47b777ac69 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -182,11 +182,17 @@ end # 4) whether groups are already sorted # Optional `groups` vector is set to the group indices of each row (starting at 1) # With skipmissing=true, rows with missing values are attributed index 0. +# +# Also the last argument is nonunique. If it is `true` then groups are not +# compressed to form a continuous sequence. Normally `false` should be passed +# as this ensures that returned `ngroups` indeed indicates the number of groups +# but in `nonunique` we do not use this information so compressing can be skipped function row_group_slots!(cols::Tuple{Vararg{AbstractVector}}, hash::Val, groups::Union{Vector{Int}, Nothing}, skipmissing::Bool, - sort::Union{Bool, Nothing} + sort::Union{Bool, Nothing}, + nonunique::Bool )::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} rpa = refpool_and_array.(cols) if sort === false @@ -196,7 +202,8 @@ function row_group_slots!(cols::Tuple{Vararg{AbstractVector}}, refpools = first.(rpa) refarrays = last.(rpa) end - row_group_slots!(cols, refpools, refarrays, hash, groups, skipmissing, sort === true) + row_group_slots!(cols, refpools, refarrays, hash, groups, skipmissing, + sort === true, nonunique) end # Generic fallback method based on open addressing hash table @@ -206,7 +213,8 @@ function row_group_slots!(cols::Tuple{Vararg{AbstractVector}}, hash::Val, groups::Union{Vector{Int}, Nothing}, skipmissing::Bool, - sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} + sort::Bool, + nonunique::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} @assert groups === nothing || length(groups) == length(cols[1]) rhashes, missings = 
hashrows(cols, skipmissing) # inspired by Dict code from base cf. https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481 @@ -262,7 +270,8 @@ function row_group_slots!(cols::NTuple{N, AbstractVector}, hash::Val{false}, groups::Vector{Int}, skipmissing::Bool, - sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N + sort::Bool, + nonunique::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N # Computing neither hashes nor groups isn't very useful, # and this method needs to allocate a groups vector anyway @assert all(col -> length(col) == length(groups), cols) @@ -300,8 +309,8 @@ function row_group_slots!(cols::NTuple{N, AbstractVector}, sort ? cols : refarrays return invoke(row_group_slots!, Tuple{Tuple{Vararg{AbstractVector}}, Any, Any, Val, - Union{Vector{Int}, Nothing}, Bool, Bool}, - newcols, refpools, refarrays, hash, groups, skipmissing, sort) + Union{Vector{Int}, Nothing}, Bool, Bool, Bool}, + newcols, refpools, refarrays, hash, groups, skipmissing, sort, nonunique) end strides = (cumprod(collect(reverse(ngroupstup)))[end-1:-1:1]..., 1)::NTuple{N, Int} @@ -430,7 +439,9 @@ function row_group_slots!(cols::NTuple{N, AbstractVector}, # If some groups are unused, compress group indices to drop them # sum(seen) is faster than all(seen) when not short-circuiting, # and short-circuit would only happen in the slower case anyway - if sum(seen) < length(seen) + # + # This process is not needed if row_group_slots! 
is called from nonunique + if !nonunique && sum(seen) < length(seen) oldngroups = ngroups remap = zeros(Int, ngroups) ngroups = 0 diff --git a/test/duplicates.jl b/test/duplicates.jl index fa3b246069..9fcd7c546e 100644 --- a/test/duplicates.jl +++ b/test/duplicates.jl @@ -106,39 +106,39 @@ end [false, false, false, true, true, true] @test nonunique(df, cols, keep=:last) == [true, true, false, true, false, false] - @test nonunique(df, cols, keep=:only) == + @test nonunique(df, cols, keep=:nonduplicates) == [true, true, false, true, true, true] @test nonunique(select(df, cols), keep=:first) == [false, false, false, true, true, true] @test nonunique(select(df, cols), keep=:last) == [true, true, false, true, false, false] - @test nonunique(select(df, cols), keep=:only) == + @test nonunique(select(df, cols), keep=:nonduplicates) == [true, true, false, true, true, true] @test unique(df, cols, keep=:first) == df[.![false, false, false, true, true, true], :] @test unique(df, cols, keep=:last) == df[.![true, true, false, true, false, false], :] - @test unique(df, cols, keep=:only) == + @test unique(df, cols, keep=:nonduplicates) == df[.![true, true, false, true, true, true], :] @test unique(select(df, cols), keep=:first) == df[.![false, false, false, true, true, true], Cols(cols)] @test unique(select(df, cols), keep=:last) == df[.![true, true, false, true, false, false], Cols(cols)] - @test unique(select(df, cols), keep=:only) == + @test unique(select(df, cols), keep=:nonduplicates) == df[.![true, true, false, true, true, true], Cols(cols)] @test unique!(copy(df), cols, keep=:first) == df[.![false, false, false, true, true, true], :] @test unique!(copy(df), cols, keep=:last) == df[.![true, true, false, true, false, false], :] - @test unique!(copy(df), cols, keep=:only) == + @test unique!(copy(df), cols, keep=:nonduplicates) == df[.![true, true, false, true, true, true], :] @test unique!(select(df, cols), keep=:first) == df[.![false, false, false, true, true, true], 
Cols(cols)] @test unique!(select(df, cols), keep=:last) == df[.![true, true, false, true, false, false], Cols(cols)] - @test unique!(select(df, cols), keep=:only) == + @test unique!(select(df, cols), keep=:nonduplicates) == df[.![true, true, false, true, true, true], Cols(cols)] end @@ -154,17 +154,25 @@ end combine(groupby(df, cols, sort=false), first) @test select(unique(df, cols, keep=:last), cols, Not(cols)) == sort(combine(groupby(df, cols, sort=false), last), :id) - @test select(unique(df, cols, keep=:only), cols, Not(cols)) == + @test select(unique(df, cols, keep=:nonduplicates), cols, Not(cols)) == sort(combine(groupby(df, cols, sort=false), sdf -> nrow(sdf) == 1 ? sdf : NamedTuple()), :id) end @test isempty(nonunique(DataFrame(), keep=:first)) @test unique(DataFrame(a=[]), keep=:last) == DataFrame(a=[]) - @test unique!(DataFrame(), keep=:only) == DataFrame() + @test unique!(DataFrame(), keep=:nonduplicates) == DataFrame() @test_throws ArgumentError nonunique(DataFrame(), keep=:a) @test_throws ArgumentError unique(DataFrame(), keep=:b) @test_throws ArgumentError unique!(DataFrame(), keep=:c) end +@testset "case when groups are not compressed in row_group_slots!" 
begin + df = DataFrame(x=repeat([1:1000; -1], 2)); + @test getindex.(keys(groupby(df, :x, sort=true)), 1) == [-1; 1:1000] + @test nonunique(df, :x) == [falses(1001); trues(1001)] + @test nonunique(df, :x, keep=:last) == [trues(1001); falses(1001)] + @test all(nonunique(df, :x, keep=:nonduplicates)) +end + end # module From 1caa657a78c223037e69da5fcfeb0ec1087adbaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 9 Jan 2023 15:03:26 +0100 Subject: [PATCH 08/12] small fixes --- src/abstractdataframe/unique.jl | 19 +++++++++---------- test/duplicates.jl | 18 +++++++++--------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index 705af10a04..f7fc1daeb3 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -13,8 +13,6 @@ indicated with a `false` entry. If `keep=:nonduplicates`, only rows without any duplicates are indicated with a `false` entry. -See also [`unique`](@ref) and [`unique!`](@ref). - # Arguments - `df` : `AbstractDataFrame` - `cols` : a selector specifying the column(s) or their transformations to @@ -22,6 +20,8 @@ See also [`unique`](@ref) and [`unique!`](@ref). [`select`](@ref) that returns at least one column if `df` has at least one column. +See also [`unique`](@ref) and [`unique!`](@ref). + # Examples ```jldoctest @@ -97,7 +97,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) refarrays = last.(rpa) if any(isnothing, refpools) || any(isnothing, refarrays) _, _, gslots, _ = row_group_slots!(cols, Val(true), nothing, - false, nothing, true) + false, nothing, true) # unique rows are the first encountered group representatives, # nonunique are everything else @inbounds for g_row in gslots @@ -168,14 +168,14 @@ Return `true` if none of the rows of `df` are duplicated. 
Two rows are duplicates if all their columns contain equal values (according to `isequal`) for all columns in `cols` (by default, all columns). -See also [`unique`](@ref) and [`nonunique`](@ref). - # Arguments - `df` : `AbstractDataFrame` - `cols` : a selector specifying the column(s) or their transformations to compare. Can be any column selector or transformation accepted by [`select`](@ref). +See also [`unique`](@ref) and [`nonunique`](@ref). + # Examples ```jldoctest @@ -218,7 +218,7 @@ all columns). If `keep=:first` (the default), only the first occurrence of a set of duplicate rows is kept. If `keep=:last`, only the last occurrence of a set of duplicate rows is kept. -If `keep=:nonduplicates`, only rows without any duplicates are kept. +If `keep=:noduplicates`, only rows without any duplicates are kept. If `view=false` a freshly allocated `DataFrame` is returned, and if `view=true` then a `SubDataFrame` view into `df` is returned. @@ -279,7 +279,7 @@ julia> unique(df, 2) 1 │ 1 1 2 │ 2 2 -julia> unique(df, keep=:nonduplicates) +julia> unique(df, keep=:noduplicates) 0×2 DataFrame Row │ i x │ Int64 Int64 @@ -310,7 +310,7 @@ all columns). If `keep=:first` (the default), only the first occurrence of a set of duplicate rows is kept. If `keep=:last`, only the last occurrence of a set of duplicate rows is kept. -If `keep=:nonduplicates`, only rows without any duplicates are kept. +If `keep=:noduplicates`, only rows without any duplicates are kept. # Arguments - `df` : the AbstractDataFrame @@ -319,7 +319,6 @@ If `keep=:nonduplicates`, only rows without any duplicates are kept. transformation accepted by [`select`](@ref) that returns at least one column if `df` has at least one column. - $METADATA_FIXED See also: [`unique!`](@ref), [`nonunique`](@ref). 
@@ -361,7 +360,7 @@ julia> unique!(copy(df)) # modifies df 3 │ 3 1 4 │ 4 2 -julia> unique(df, keep=:nonduplicates) +julia> unique(df, keep=:noduplicates) 0×2 DataFrame Row │ i x │ Int64 Int64 diff --git a/test/duplicates.jl b/test/duplicates.jl index 9fcd7c546e..61c01874d2 100644 --- a/test/duplicates.jl +++ b/test/duplicates.jl @@ -106,39 +106,39 @@ end [false, false, false, true, true, true] @test nonunique(df, cols, keep=:last) == [true, true, false, true, false, false] - @test nonunique(df, cols, keep=:nonduplicates) == + @test nonunique(df, cols, keep=:noduplicates) == [true, true, false, true, true, true] @test nonunique(select(df, cols), keep=:first) == [false, false, false, true, true, true] @test nonunique(select(df, cols), keep=:last) == [true, true, false, true, false, false] - @test nonunique(select(df, cols), keep=:nonduplicates) == + @test nonunique(select(df, cols), keep=:noduplicates) == [true, true, false, true, true, true] @test unique(df, cols, keep=:first) == df[.![false, false, false, true, true, true], :] @test unique(df, cols, keep=:last) == df[.![true, true, false, true, false, false], :] - @test unique(df, cols, keep=:nonduplicates) == + @test unique(df, cols, keep=:noduplicates) == df[.![true, true, false, true, true, true], :] @test unique(select(df, cols), keep=:first) == df[.![false, false, false, true, true, true], Cols(cols)] @test unique(select(df, cols), keep=:last) == df[.![true, true, false, true, false, false], Cols(cols)] - @test unique(select(df, cols), keep=:nonduplicates) == + @test unique(select(df, cols), keep=:noduplicates) == df[.![true, true, false, true, true, true], Cols(cols)] @test unique!(copy(df), cols, keep=:first) == df[.![false, false, false, true, true, true], :] @test unique!(copy(df), cols, keep=:last) == df[.![true, true, false, true, false, false], :] - @test unique!(copy(df), cols, keep=:nonduplicates) == + @test unique!(copy(df), cols, keep=:noduplicates) == df[.![true, true, false, true, true, true], :] 
@test unique!(select(df, cols), keep=:first) == df[.![false, false, false, true, true, true], Cols(cols)] @test unique!(select(df, cols), keep=:last) == df[.![true, true, false, true, false, false], Cols(cols)] - @test unique!(select(df, cols), keep=:nonduplicates) == + @test unique!(select(df, cols), keep=:noduplicates) == df[.![true, true, false, true, true, true], Cols(cols)] end @@ -154,14 +154,14 @@ end combine(groupby(df, cols, sort=false), first) @test select(unique(df, cols, keep=:last), cols, Not(cols)) == sort(combine(groupby(df, cols, sort=false), last), :id) - @test select(unique(df, cols, keep=:nonduplicates), cols, Not(cols)) == + @test select(unique(df, cols, keep=:noduplicates), cols, Not(cols)) == sort(combine(groupby(df, cols, sort=false), sdf -> nrow(sdf) == 1 ? sdf : NamedTuple()), :id) end @test isempty(nonunique(DataFrame(), keep=:first)) @test unique(DataFrame(a=[]), keep=:last) == DataFrame(a=[]) - @test unique!(DataFrame(), keep=:nonduplicates) == DataFrame() + @test unique!(DataFrame(), keep=:noduplicates) == DataFrame() @test_throws ArgumentError nonunique(DataFrame(), keep=:a) @test_throws ArgumentError unique(DataFrame(), keep=:b) @test_throws ArgumentError unique!(DataFrame(), keep=:c) @@ -172,7 +172,7 @@ end @test getindex.(keys(groupby(df, :x, sort=true)), 1) == [-1; 1:1000] @test nonunique(df, :x) == [falses(1001); trues(1001)] @test nonunique(df, :x, keep=:last) == [trues(1001); falses(1001)] - @test all(nonunique(df, :x, keep=:nonduplicates)) + @test all(nonunique(df, :x, keep=:noduplicates)) end end # module From c48a96c31eedfd74a04890878756411a557ebe6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 9 Jan 2023 18:10:43 +0100 Subject: [PATCH 09/12] fix typo --- src/abstractdataframe/unique.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index f7fc1daeb3..cc2a8d1f1f 100644 --- 
a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -10,7 +10,7 @@ If `keep=:first` (the default), only the first occurrence of a set of duplicate rows is indicated with a `false` entry. If `keep=:last`, only the last occurrence of a set of duplicate rows is indicated with a `false` entry. -If `keep=:nonduplicates`, only rows without any duplicates are indicated with a +If `keep=:noduplicates`, only rows without any duplicates are indicated with a `false` entry. # Arguments @@ -84,8 +84,8 @@ julia> nonunique(df, 2) ``` """ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) - if !(keep in (:first, :last, :nonduplicates)) - throw(ArgumentError("`keep` must be :first, :last, or :nonduplicates")) + if !(keep in (:first, :last, :noduplicates)) + throw(ArgumentError("`keep` must be :first, :last, or :noduplicates")) end ncol(df) == 0 && return Bool[] res = fill(true, nrow(df)) @@ -129,7 +129,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) end end else - @assert keep == :nonduplicates + @assert keep == :noduplicates # -1 indicates that we have not seen the group yet # positive value indicates the first position we have seen the group # 0 indicates that we have seen the group at least twice From 6212b45ba4d41001e7d716652d5fb0b51068c2a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 26 Jan 2023 16:11:53 +0100 Subject: [PATCH 10/12] change last argument name of row_group_slots! 
--- src/abstractdataframe/abstractdataframe.jl | 3 +-- src/abstractdataframe/unique.jl | 13 ++++++------- src/groupeddataframe/groupeddataframe.jl | 2 +- src/groupeddataframe/utils.jl | 21 +++++++++++---------- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 29262019f3..1056ed665b 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -1406,7 +1406,7 @@ function fillcombinations(df::AbstractDataFrame, indexcols; # we use hashing algorithm here, because we assume that the tables we work with are not huge has_duplicates = row_group_slots!(ntuple(i -> df[!, colind[i]], length(colind)), - Val(false), nothing, false, nothing, false)[1] != nrow(df) + Val(false), nothing, false, nothing, true)[1] != nrow(df) if has_duplicates && !allowduplicates throw(ArgumentError("duplicate combinations of `indexcols` are not " * "allowed in input when `allowduplicates=false`")) @@ -3131,4 +3131,3 @@ function Base.iterate(itr::Iterators.PartitionIterator{<:AbstractDataFrame}, sta r = min(state + itr.n - 1, last_idx) return view(itr.c, state:r, :), r + 1 end - diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index cc2a8d1f1f..19cec8be7d 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -97,7 +97,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) refarrays = last.(rpa) if any(isnothing, refpools) || any(isnothing, refarrays) _, _, gslots, _ = row_group_slots!(cols, Val(true), nothing, - false, nothing, true) + false, nothing, false) # unique rows are the first encountered group representatives, # nonunique are everything else @inbounds for g_row in gslots @@ -106,7 +106,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) else groups = Vector{Int}(undef, nrow(df)) ngroups = row_group_slots!(cols, refpools, refarrays, - Val(false), 
groups, false, false, true)[1] + Val(false), groups, false, false, false)[1] seen = fill(false, ngroups) for i in 1:nrow(df) g = groups[i] @@ -118,7 +118,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) end else groups = Vector{Int}(undef, nrow(df)) - ngroups = row_group_slots!(cols, Val(false), groups, false, nothing, true)[1] + ngroups = row_group_slots!(cols, Val(false), groups, false, nothing, false)[1] if keep == :last seen = fill(false, ngroups) for i in nrow(df):-1:1 @@ -203,7 +203,7 @@ function Base.allunique(df::AbstractDataFrame, cols=:) udf = _try_select_no_copy(df, cols) nrow(udf) == 0 && return true return row_group_slots!(ntuple(i -> udf[!, i], ncol(udf)), - Val(false), nothing, false, nothing, false)[1] == nrow(df) + Val(false), nothing, false, nothing, true)[1] == nrow(df) end """ @@ -281,7 +281,7 @@ julia> unique(df, 2) julia> unique(df, keep=:noduplicates) 0×2 DataFrame - Row │ i x + Row │ i x │ Int64 Int64 ─────┴────────────── ``` @@ -362,7 +362,7 @@ julia> unique!(copy(df)) # modifies df julia> unique(df, keep=:noduplicates) 0×2 DataFrame - Row │ i x + Row │ i x │ Int64 Int64 ─────┴────────────── ``` @@ -373,4 +373,3 @@ Base.unique!(df::AbstractDataFrame, cols::AbstractVector; keep::Symbol=:first) = deleteat!(df, _findall(nonunique(df, cols, keep=keep))) Base.unique!(df::AbstractDataFrame, cols; keep::Symbol=:first) = deleteat!(df, _findall(nonunique(df, cols, keep=keep))) - diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index b0bc59046e..d08bef7f55 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -250,7 +250,7 @@ function groupby(df::AbstractDataFrame, cols; ngroups, rhashes, gslots, sorted = row_group_slots!(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false), groups, skipmissing, - sort isa NamedTuple ? nothing : sort, false) + sort isa NamedTuple ? 
nothing : sort, true) gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing, ngroups, nothing, Threads.ReentrantLock()) diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index 47b777ac69..ade793042c 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -183,16 +183,17 @@ end # Optional `groups` vector is set to the group indices of each row (starting at 1) # With skipmissing=true, rows with missing values are attributed index 0. # -# Also the last argument is nonunique. If it is `true` then groups are not -# compressed to form a continuous sequence. Normally `false` should be passed +# Also the last argument is `compress`. If it is `false` then groups are not +# compressed to form a continuous sequence. Normally `true` should be passed # as this ensures that returned `ngroups` indeed indicates the number of groups -# but in `nonunique` we do not use this information so compressing can be skipped +# but e.g. in `nonunique` we do not use this information so compressing +# can be skipped by passing `compress=false` function row_group_slots!(cols::Tuple{Vararg{AbstractVector}}, hash::Val, groups::Union{Vector{Int}, Nothing}, skipmissing::Bool, sort::Union{Bool, Nothing}, - nonunique::Bool + compress::Bool )::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} rpa = refpool_and_array.(cols) if sort === false @@ -203,7 +204,7 @@ function row_group_slots!(cols::Tuple{Vararg{AbstractVector}}, refarrays = last.(rpa) end row_group_slots!(cols, refpools, refarrays, hash, groups, skipmissing, - sort === true, nonunique) + sort === true, compress) end # Generic fallback method based on open addressing hash table @@ -214,7 +215,7 @@ function row_group_slots!(cols::Tuple{Vararg{AbstractVector}}, groups::Union{Vector{Int}, Nothing}, skipmissing::Bool, sort::Bool, - nonunique::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} + compress::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} @assert groups === 
nothing || length(groups) == length(cols[1]) rhashes, missings = hashrows(cols, skipmissing) # inspired by Dict code from base cf. https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481 @@ -271,7 +272,7 @@ function row_group_slots!(cols::NTuple{N, AbstractVector}, groups::Vector{Int}, skipmissing::Bool, sort::Bool, - nonunique::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N + compress::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N # Computing neither hashes nor groups isn't very useful, # and this method needs to allocate a groups vector anyway @assert all(col -> length(col) == length(groups), cols) @@ -310,7 +311,7 @@ function row_group_slots!(cols::NTuple{N, AbstractVector}, return invoke(row_group_slots!, Tuple{Tuple{Vararg{AbstractVector}}, Any, Any, Val, Union{Vector{Int}, Nothing}, Bool, Bool, Bool}, - newcols, refpools, refarrays, hash, groups, skipmissing, sort, nonunique) + newcols, refpools, refarrays, hash, groups, skipmissing, sort, compress) end strides = (cumprod(collect(reverse(ngroupstup)))[end-1:-1:1]..., 1)::NTuple{N, Int} @@ -440,8 +441,8 @@ function row_group_slots!(cols::NTuple{N, AbstractVector}, # sum(seen) is faster than all(seen) when not short-circuiting, # and short-circuit would only happen in the slower case anyway # - # This process is not needed if row_group_slots! is called from nonunique - if !nonunique && sum(seen) < length(seen) + # This process is not needed if row_group_slots! 
is called with compress=false + if compress && sum(seen) < length(seen) oldngroups = ngroups remap = zeros(Int, ngroups) ngroups = 0 From 1bd00b9a8167d9635765ad4e59478e785f271c5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 28 Jan 2023 10:49:23 +0100 Subject: [PATCH 11/12] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/unique.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index 19cec8be7d..03cddfe74d 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -91,10 +91,10 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) res = fill(true, nrow(df)) cols = ntuple(i -> df[!, i], ncol(df)) if keep == :first - # if we can take advantage of references pass groups to avoid generating hashes rpa = refpool_and_array.(cols) refpools = first.(rpa) refarrays = last.(rpa) + # if refarray cannot be used, we can avoid allocating a groups vector if any(isnothing, refpools) || any(isnothing, refarrays) _, _, gslots, _ = row_group_slots!(cols, Val(true), nothing, false, nothing, false) @@ -103,7 +103,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) @inbounds for g_row in gslots g_row > 0 && (res[g_row] = false) end - else + else # faster refarray method but allocates a groups vector groups = Vector{Int}(undef, nrow(df)) ngroups = row_group_slots!(cols, refpools, refarrays, Val(false), groups, false, false, false)[1] @@ -117,6 +117,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) end end else + # always allocate a group vector, use refarray automatically if possible groups = Vector{Int}(undef, nrow(df)) ngroups = row_group_slots!(cols, Val(false), groups, false, nothing, false)[1] if keep == :last From 0f46347f04d50aa650e84281e71adf35c7734184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 28 
Jan 2023 14:15:41 +0100 Subject: [PATCH 12/12] Update src/groupeddataframe/utils.jl --- src/groupeddataframe/utils.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index ade793042c..d8acb7983a 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -193,8 +193,7 @@ function row_group_slots!(cols::Tuple{Vararg{AbstractVector}}, groups::Union{Vector{Int}, Nothing}, skipmissing::Bool, sort::Union{Bool, Nothing}, - compress::Bool - )::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} + compress::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} rpa = refpool_and_array.(cols) if sort === false refpools = nothing