diff --git a/NEWS.md b/NEWS.md index da12048624..39aee15a8f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,9 @@ * Joining functions now support `order` keyword argument allowing the user to specify the order of the rows in the produced table ([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233)) +* Add `keep` keyword argument to `nonunique`, `unique`, and `unique!` + allowing the user to specify which duplicate rows should be kept + ([#3260](https://github.com/JuliaData/DataFrames.jl/pull/3260)) ## Bug fixes diff --git a/src/DataFrames.jl b/src/DataFrames.jl index c5d8366214..a2a652154a 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -134,6 +134,7 @@ include("other/utils.jl") include("other/index.jl") include("abstractdataframe/abstractdataframe.jl") +include("abstractdataframe/unique.jl") include("dataframe/dataframe.jl") include("subdataframe/subdataframe.jl") include("dataframerow/dataframerow.jl") diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 9fba690d49..1056ed665b 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -1342,278 +1342,6 @@ end Base.Array(df::AbstractDataFrame) = Matrix(df) Base.Array{T}(df::AbstractDataFrame) where {T} = Matrix{T}(df) -""" - nonunique(df::AbstractDataFrame) - nonunique(df::AbstractDataFrame, cols) - -Return a `Vector{Bool}` in which `true` entries indicate duplicate rows. -A row is a duplicate if there exists a prior row with all columns containing -equal values (according to `isequal`). - -See also [`unique`](@ref) and [`unique!`](@ref). - -# Arguments -- `df` : `AbstractDataFrame` -- `cols` : a selector specifying the column(s) or their transformations to compare. - Can be any column selector or transformation accepted by [`select`](@ref) that - returns at least one column if `df` has at least one column. 
- -# Examples - -```jldoctest -julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - -julia> df = vcat(df, df) -8×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - 5 │ 1 1 - 6 │ 2 2 - 7 │ 3 1 - 8 │ 4 2 - -julia> nonunique(df) -8-element Vector{Bool}: - 0 - 0 - 0 - 0 - 1 - 1 - 1 - 1 - -julia> nonunique(df, 2) -8-element Vector{Bool}: - 0 - 0 - 1 - 1 - 1 - 1 - 1 - 1 -``` -""" -function nonunique(df::AbstractDataFrame) - ncol(df) == 0 && return Bool[] - gslots = row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), nothing, false, nothing)[3] - # unique rows are the first encountered group representatives, - # nonunique are everything else - res = fill(true, nrow(df)) - @inbounds for g_row in gslots - (g_row > 0) && (res[g_row] = false) - end - return res -end - -function nonunique(df::AbstractDataFrame, cols) - udf = _try_select_no_copy(df, cols) - if ncol(df) > 0 && ncol(udf) == 0 - throw(ArgumentError("finding duplicate rows in data frame when " * - "`cols` selects no columns is not allowed")) - else - return nonunique(udf) - end -end - -""" - allunique(df::AbstractDataFrame, cols=:) - -Return `true` if all rows of `df` are not duplicated. Two rows are duplicate if -all their columns contain equal values (according to `isequal`). - -See also [`unique`](@ref) and [`nonunique`](@ref). - -# Arguments -- `df` : `AbstractDataFrame` -- `cols` : a selector specifying the column(s) or their transformations to compare. - Can be any column selector or transformation accepted by [`select`](@ref). 
- -# Examples - -```jldoctest -julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - -julia> allunique(df) -true - -julia> allunique(df, :x) -false - -julia> allunique(df, :i => ByRow(isodd)) -false -``` -""" -function Base.allunique(df::AbstractDataFrame, cols=:) - udf = _try_select_no_copy(df, cols) - nrow(udf) == 0 && return true - return row_group_slots(ntuple(i -> udf[!, i], ncol(udf)), - Val(false), nothing, false, nothing)[1] == nrow(df) -end - -""" - unique(df::AbstractDataFrame; view::Bool=false) - unique(df::AbstractDataFrame, cols; view::Bool=false) - -Return a data frame containing only the first occurrence of unique rows in `df`. -When `cols` is specified, the returned `DataFrame` contains complete rows, -retaining in each case the first occurrence of a given combination of values -in selected columns or their transformations. `cols` can be any column -selector or transformation accepted by [`select`](@ref). - -If `view=false` a freshly allocated `DataFrame` is returned, -and if `view=true` then a `SubDataFrame` view into `df` is returned. - -# Arguments -- `df` : the AbstractDataFrame -- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.) -specifying the column(s) to compare. - -$METADATA_FIXED - -See also: [`unique!`](@ref), [`nonunique`](@ref). 
- -# Examples - -```jldoctest -julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - -julia> df = vcat(df, df) -8×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - 5 │ 1 1 - 6 │ 2 2 - 7 │ 3 1 - 8 │ 4 2 - -julia> unique(df) # doesn't modify df -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - -julia> unique(df, 2) -2×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 -``` -""" -@inline function Base.unique(df::AbstractDataFrame; view::Bool=false) - rowidxs = (!).(nonunique(df)) - return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] -end - -@inline function Base.unique(df::AbstractDataFrame, cols; view::Bool=false) - rowidxs = (!).(nonunique(df, cols)) - return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] -end - -""" - unique!(df::AbstractDataFrame) - unique!(df::AbstractDataFrame, cols) - -Update `df` in-place to contain only the first occurrence of unique rows in `df`. -When `cols` is specified, the returned `DataFrame` contains complete rows, -retaining in each case the first occurrence of a given combination of values -in selected columns or their transformations. `cols` can be any column -selector or transformation accepted by [`select`](@ref). - -# Arguments -- `df` : the AbstractDataFrame -- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.) -specifying the column(s) to compare. - -$METADATA_FIXED - -See also: [`unique!`](@ref), [`nonunique`](@ref). 
- -# Examples - -```jldoctest -julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - -julia> df = vcat(df, df) -8×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 - 5 │ 1 1 - 6 │ 2 2 - 7 │ 3 1 - 8 │ 4 2 - -julia> unique!(df) # modifies df -4×2 DataFrame - Row │ i x - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 1 - 4 │ 4 2 -``` -""" -Base.unique!(df::AbstractDataFrame) = deleteat!(df, _findall(nonunique(df))) -Base.unique!(df::AbstractDataFrame, cols::AbstractVector) = - deleteat!(df, _findall(nonunique(df, cols))) -Base.unique!(df::AbstractDataFrame, cols) = - deleteat!(df, _findall(nonunique(df, cols))) - """ fillcombinations(df::AbstractDataFrame, indexcols; allowduplicates::Bool=false, @@ -1676,8 +1404,9 @@ function fillcombinations(df::AbstractDataFrame, indexcols; "must be specified")) end - has_duplicates = row_group_slots(ntuple(i -> df[!, colind[i]], length(colind)), - Val(false), nothing, false, nothing)[1] != nrow(df) + # we use hashing algorithm here, because we assume that the tables we work with are not huge + has_duplicates = row_group_slots!(ntuple(i -> df[!, colind[i]], length(colind)), + Val(false), nothing, false, nothing, true)[1] != nrow(df) if has_duplicates && !allowduplicates throw(ArgumentError("duplicate combinations of `indexcols` are not " * "allowed in input when `allowduplicates=false`")) @@ -3402,4 +3131,3 @@ function Base.iterate(itr::Iterators.PartitionIterator{<:AbstractDataFrame}, sta r = min(state + itr.n - 1, last_idx) return view(itr.c, state:r, :), r + 1 end - diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl new file mode 100644 index 0000000000..03cddfe74d --- /dev/null +++ b/src/abstractdataframe/unique.jl @@ -0,0 +1,376 @@ +""" + nonunique(df::AbstractDataFrame; keep::Symbol=:first) + nonunique(df::AbstractDataFrame, cols; 
keep::Symbol=:first) + +Return a `Vector{Bool}` in which `true` entries indicate duplicate rows. + +Duplicate rows are those for which at least another row contains equal values +(according to `isequal`) for all columns in `cols` (by default, all columns). +If `keep=:first` (the default), only the first occurrence of a set of duplicate +rows is indicated with a `false` entry. +If `keep=:last`, only the last occurrence of a set of duplicate rows is +indicated with a `false` entry. +If `keep=:noduplicates`, only rows without any duplicates are indicated with a +`false` entry. + +# Arguments +- `df` : `AbstractDataFrame` +- `cols` : a selector specifying the column(s) or their transformations to + compare. Can be any column selector or transformation accepted by + [`select`](@ref) that returns at least one column if `df` has at least one + column. + +See also [`unique`](@ref) and [`unique!`](@ref). + +# Examples + +```jldoctest +julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> df = vcat(df, df) +8×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + 5 │ 1 1 + 6 │ 2 2 + 7 │ 3 1 + 8 │ 4 2 + +julia> nonunique(df) +8-element Vector{Bool}: + 0 + 0 + 0 + 0 + 1 + 1 + 1 + 1 + +julia> nonunique(df, keep=:last) +8-element Vector{Bool}: + 1 + 1 + 1 + 1 + 0 + 0 + 0 + 0 + +julia> nonunique(df, 2) +8-element Vector{Bool}: + 0 + 0 + 1 + 1 + 1 + 1 + 1 + 1 +``` +""" +function nonunique(df::AbstractDataFrame; keep::Symbol=:first) + if !(keep in (:first, :last, :noduplicates)) + throw(ArgumentError("`keep` must be :first, :last, or :noduplicates")) + end + ncol(df) == 0 && return Bool[] + res = fill(true, nrow(df)) + cols = ntuple(i -> df[!, i], ncol(df)) + if keep == :first + rpa = refpool_and_array.(cols) + refpools = first.(rpa) + refarrays = last.(rpa) + # if refarray cannot be used, we can avoid allocating a groups vector 
+ if any(isnothing, refpools) || any(isnothing, refarrays) + _, _, gslots, _ = row_group_slots!(cols, Val(true), nothing, + false, nothing, false) + # unique rows are the first encountered group representatives, + # nonunique are everything else + @inbounds for g_row in gslots + g_row > 0 && (res[g_row] = false) + end + else # faster refarray method but allocates a groups vector + groups = Vector{Int}(undef, nrow(df)) + ngroups = row_group_slots!(cols, refpools, refarrays, + Val(false), groups, false, false, false)[1] + seen = fill(false, ngroups) + for i in 1:nrow(df) + g = groups[i] + if !seen[g] + seen[g] = true + res[i] = false + end + end + end + else + # always allocate a group vector, use refarray automatically if possible + groups = Vector{Int}(undef, nrow(df)) + ngroups = row_group_slots!(cols, Val(false), groups, false, nothing, false)[1] + if keep == :last + seen = fill(false, ngroups) + for i in nrow(df):-1:1 + g = groups[i] + if !seen[g] + seen[g] = true + res[i] = false + end + end + else + @assert keep == :noduplicates + # -1 indicates that we have not seen the group yet + # positive value indicates the first position we have seen the group + # 0 indicates that we have seen the group at least twice + firstseen = fill(-1, ngroups) + for i in 1:nrow(df) + g = groups[i] + j = firstseen[g] + if j == -1 + # this is possibly a non duplicate row + firstseen[g] = i + res[i] = false + elseif j > 0 + # the row had a duplicate + res[j] = true + firstseen[g] = 0 + end + end + end + end + return res +end + +function nonunique(df::AbstractDataFrame, cols; keep::Symbol=:first) + udf = _try_select_no_copy(df, cols) + if ncol(df) > 0 && ncol(udf) == 0 + throw(ArgumentError("finding duplicate rows in data frame when " * + "`cols` selects no columns is not allowed")) + end + return nonunique(udf, keep=keep) +end + +""" + allunique(df::AbstractDataFrame, cols=:) + +Return `true` if none of the rows of `df` are duplicated. 
Two rows are +duplicates if they contain equal values (according to `isequal`) +for all columns in `cols` (by default, all columns). + +# Arguments +- `df` : `AbstractDataFrame` +- `cols` : a selector specifying the column(s) or their transformations to + compare. Can be any column selector or transformation accepted by + [`select`](@ref). + +See also [`unique`](@ref) and [`nonunique`](@ref). + +# Examples + +```jldoctest +julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> allunique(df) +true + +julia> allunique(df, :x) +false + +julia> allunique(df, :i => ByRow(isodd)) +false +``` +""" +function Base.allunique(df::AbstractDataFrame, cols=:) + udf = _try_select_no_copy(df, cols) + nrow(udf) == 0 && return true + return row_group_slots!(ntuple(i -> udf[!, i], ncol(udf)), + Val(false), nothing, false, nothing, true)[1] == nrow(df) +end + +""" + unique(df::AbstractDataFrame; view::Bool=false, keep::Symbol=:first) + unique(df::AbstractDataFrame, cols; view::Bool=false, keep::Symbol=:first) + +Return a data frame containing only unique rows in `df`. + +Non-unique (duplicate) rows are those for which at least another row contains +equal values (according to `isequal`) for all columns in `cols` (by default, +all columns). +If `keep=:first` (the default), only the first occurrence of a set of duplicate +rows is kept. +If `keep=:last`, only the last occurrence of a set of duplicate rows is kept. +If `keep=:noduplicates`, only rows without any duplicates are kept. + +If `view=false` a freshly allocated `DataFrame` is returned, and if `view=true` +then a `SubDataFrame` view into `df` is returned. + +# Arguments +- `df` : the AbstractDataFrame +- `cols` : a selector specifying the column(s) or their transformations to + compare. 
Can be any column selector or transformation accepted by + [`select`](@ref) that returns at least one column if `df` has at least one + column. + +$METADATA_FIXED + +See also: [`unique!`](@ref), [`nonunique`](@ref). + +# Examples + +```jldoctest +julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> df = vcat(df, df) +8×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + 5 │ 1 1 + 6 │ 2 2 + 7 │ 3 1 + 8 │ 4 2 + +julia> unique(df) # doesn't modify df +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> unique(df, 2) +2×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + +julia> unique(df, keep=:noduplicates) +0×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┴────────────── +``` +""" +@inline function Base.unique(df::AbstractDataFrame; view::Bool=false, + keep::Symbol=:first) + rowidxs = (!).(nonunique(df, keep=keep)) + return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] +end + +@inline function Base.unique(df::AbstractDataFrame, cols; view::Bool=false, + keep::Symbol=:first) + rowidxs = (!).(nonunique(df, cols, keep=keep)) + return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] +end + +""" + unique!(df::AbstractDataFrame; keep::Symbol=:first) + unique!(df::AbstractDataFrame, cols; keep::Symbol=:first) + +Update `df` in-place to contain only unique rows. + +Non-unique (duplicate) rows are those for which at least another row contains +equal values (according to `isequal`) for all columns in `cols` (by default, +all columns). +If `keep=:first` (the default), only the first occurrence of a set of duplicate +rows is kept. +If `keep=:last`, only the last occurrence of a set of duplicate rows is kept. +If `keep=:noduplicates`, only rows without any duplicates are kept. 
+# Arguments +- `df` : the AbstractDataFrame +- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.) + specifying the column(s) to compare. Can be any column selector or + transformation accepted by [`select`](@ref) that returns at least one column + if `df` has at least one column. + +$METADATA_FIXED + +See also: [`unique`](@ref), [`nonunique`](@ref). + +# Examples + +```jldoctest +julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> df = vcat(df, df) +8×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + 5 │ 1 1 + 6 │ 2 2 + 7 │ 3 1 + 8 │ 4 2 + +julia> unique!(copy(df)) # modifies the copy; df is unchanged +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> unique!(df, keep=:noduplicates) +0×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┴────────────── +``` +""" +Base.unique!(df::AbstractDataFrame; keep::Symbol=:first) = + deleteat!(df, _findall(nonunique(df, keep=keep))) +Base.unique!(df::AbstractDataFrame, cols::AbstractVector; keep::Symbol=:first) = + deleteat!(df, _findall(nonunique(df, cols, keep=keep))) +Base.unique!(df::AbstractDataFrame, cols; keep::Symbol=:first) = + deleteat!(df, _findall(nonunique(df, cols, keep=keep))) diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index f6d4bf9c69..d08bef7f55 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -223,7 +223,7 @@ function groupby(df::AbstractDataFrame, cols; (cols isa AbstractVector && any(x -> x isa UserColOrdering, cols)) if isnothing(sort) || sort === true # if sort === true replace it with NamedTuple to avoid sorting - # in row_group_slots as we will perform sorting later + # in row_group_slots! 
as we will perform sorting later sort = NamedTuple() elseif sort === false throw(ArgumentError("passing `order` is only allowed if `sort` " * @@ -248,13 +248,14 @@ function groupby(df::AbstractDataFrame, cols; groups = Vector{Int}(undef, nrow(df)) ngroups, rhashes, gslots, sorted = - row_group_slots(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false), - groups, skipmissing, sort isa NamedTuple ? nothing : sort) + row_group_slots!(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false), + groups, skipmissing, + sort isa NamedTuple ? nothing : sort, true) gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing, ngroups, nothing, Threads.ReentrantLock()) - # sort groups if row_group_slots hasn't already done that + # sort groups if row_group_slots! hasn't already done that if (sort === true && !sorted) || (sort isa NamedTuple) # Find index of representative row for each group idx = Vector{Int}(undef, length(gd)) diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index 3139f30339..d8acb7983a 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -82,12 +82,12 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int, # IntegerRefarray and IntegerRefPool are two complementary view types that allow # wrapping arrays with Union{Real, Missing} eltype to satisfy the DataAPI.refpool -# and DataAPI.refarray API when calling row_group_slots. +# and DataAPI.refarray API when calling row_group_slots!. # IntegerRefarray converts values to Int and replaces missing with an integer # (set by the caller to the maximum value + 1) # IntegerRefPool subtracts the minimum value - 1 and replaces back the maximum # value + 1 to missing. This ensures all values are in 1:length(refpool), while -# row_group_slots knows the number of (potential) groups via length(refpool) +# row_group_slots! 
knows the number of (potential) groups via length(refpool) # and is able to skip missing values when skipmissing=true struct IntegerRefarray{T<:AbstractArray} <: AbstractVector{Int} @@ -157,7 +157,7 @@ function refpool_and_array(x::AbstractArray) minval, maxval = extrema(x) end ngroups = big(maxval) - big(minval) + 1 - # Threshold chosen with the same rationale as the row_group_slots refpool method: + # Threshold chosen with the same rationale as the row_group_slots! refpool method: # refpool approach is faster but we should not allocate too much memory either # We also have to avoid overflow, including with ngroups + 1 for missing values # (note that it would be possible to allow minval and maxval to be outside of the @@ -178,14 +178,22 @@ end # 2) vector of row hashes (may be empty if hash=Val(false)) # 3) slot array for a hash map, non-zero values are # the indices of the first row in a group +# (returned only if hashes are generated) # 4) whether groups are already sorted # Optional `groups` vector is set to the group indices of each row (starting at 1) # With skipmissing=true, rows with missing values are attributed index 0. -function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, - hash::Val, - groups::Union{Vector{Int}, Nothing}, - skipmissing::Bool, - sort::Union{Bool, Nothing})::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} +# +# Also the last argument is `compress`. If it is `false` then groups are not +# compressed to form a continuous sequence. Normally `true` should be passed +# as this ensures that returned `ngroups` indeed indicates the number of groups +# but e.g. 
in `nonunique` we do not use this information so compressing +# can be skipped by passing `compress=false` +function row_group_slots!(cols::Tuple{Vararg{AbstractVector}}, + hash::Val, + groups::Union{Vector{Int}, Nothing}, + skipmissing::Bool, + sort::Union{Bool, Nothing}, + compress::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} rpa = refpool_and_array.(cols) if sort === false refpools = nothing @@ -194,17 +202,19 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, refpools = first.(rpa) refarrays = last.(rpa) end - row_group_slots(cols, refpools, refarrays, hash, groups, skipmissing, sort === true) + row_group_slots!(cols, refpools, refarrays, hash, groups, skipmissing, + sort === true, compress) end # Generic fallback method based on open addressing hash table -function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, - refpools::Any, # Ignored - refarrays::Any, # Ignored - hash::Val, - groups::Union{Vector{Int}, Nothing}, - skipmissing::Bool, - sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} +function row_group_slots!(cols::Tuple{Vararg{AbstractVector}}, + refpools::Any, # Ignored + refarrays::Any, # Ignored + hash::Val, + groups::Union{Vector{Int}, Nothing}, + skipmissing::Bool, + sort::Bool, + compress::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} @assert groups === nothing || length(groups) == length(cols[1]) rhashes, missings = hashrows(cols, skipmissing) # inspired by Dict code from base cf. 
https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481 @@ -251,16 +261,17 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, end # Optimized method for arrays for which DataAPI.refpool is defined and returns an AbstractVector -function row_group_slots(cols::NTuple{N, AbstractVector}, - refpools::NTuple{N, AbstractVector}, - refarrays::NTuple{N, - Union{AbstractVector{<:Real}, - Missings.EachReplaceMissing{ - <:AbstractVector{<:Union{Real, Missing}}}}}, - hash::Val{false}, - groups::Vector{Int}, - skipmissing::Bool, - sort::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N +function row_group_slots!(cols::NTuple{N, AbstractVector}, + refpools::NTuple{N, AbstractVector}, + refarrays::NTuple{N, + Union{AbstractVector{<:Real}, + Missings.EachReplaceMissing{ + <:AbstractVector{<:Union{Real, Missing}}}}}, + hash::Val{false}, + groups::Vector{Int}, + skipmissing::Bool, + sort::Bool, + compress::Bool)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N # Computing neither hashes nor groups isn't very useful, # and this method needs to allocate a groups vector anyway @assert all(col -> length(col) == length(groups), cols) @@ -296,10 +307,10 @@ function row_group_slots(cols::NTuple{N, AbstractVector}, newcols = (skipmissing && any(refpool -> eltype(refpool) >: Missing, refpools)) || !(refarrays isa NTuple{<:Any, AbstractVector}) || sort ? 
cols : refarrays - return invoke(row_group_slots, + return invoke(row_group_slots!, Tuple{Tuple{Vararg{AbstractVector}}, Any, Any, Val, - Union{Vector{Int}, Nothing}, Bool, Bool}, - newcols, refpools, refarrays, hash, groups, skipmissing, sort) + Union{Vector{Int}, Nothing}, Bool, Bool, Bool}, + newcols, refpools, refarrays, hash, groups, skipmissing, sort, compress) end strides = (cumprod(collect(reverse(ngroupstup)))[end-1:-1:1]..., 1)::NTuple{N, Int} @@ -428,7 +439,9 @@ function row_group_slots(cols::NTuple{N, AbstractVector}, # If some groups are unused, compress group indices to drop them # sum(seen) is faster than all(seen) when not short-circuiting, # and short-circuit would only happen in the slower case anyway - if sum(seen) < length(seen) + # + # This process is not needed if row_group_slots! is called with compress=false + if compress && sum(seen) < length(seen) oldngroups = ngroups remap = zeros(Int, ngroups) ngroups = 0 diff --git a/test/data.jl b/test/data.jl index 3399ad35e7..b5348c5705 100644 --- a/test/data.jl +++ b/test/data.jl @@ -229,62 +229,6 @@ end @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true) end -@testset "nonunique, nonunique, unique! 
with extra argument" begin - df1 = DataFrame(a=Union{String, Missing}["a", "b", "a", "b", "a", "b"], - b=Vector{Union{Int, Missing}}(1:6), - c=Union{Int, Missing}[1:3;1:3]) - df = vcat(df1, df1) - @test findall(nonunique(df)) == collect(7:12) - @test findall(nonunique(df, :)) == collect(7:12) - @test findall(nonunique(df, Colon())) == collect(7:12) - @test findall(nonunique(df, :a)) == collect(3:12) - @test findall(nonunique(df, "a")) == collect(3:12) - @test findall(nonunique(df, [:a, :c])) == collect(7:12) - @test findall(nonunique(df, ["a", "c"])) == collect(7:12) - @test findall(nonunique(df, r"[ac]")) == collect(7:12) - @test findall(nonunique(df, Not(2))) == collect(7:12) - @test findall(nonunique(df, Not([2]))) == collect(7:12) - @test findall(nonunique(df, Not(:b))) == collect(7:12) - @test findall(nonunique(df, Not([:b]))) == collect(7:12) - @test findall(nonunique(df, Not([false, true, false]))) == collect(7:12) - @test findall(nonunique(df, [1, 3])) == collect(7:12) - @test findall(nonunique(df, 1)) == collect(3:12) - @test findall(nonunique(df, :a => x -> 1)) == 2:12 - - @test unique(df) == df1 - @test unique(df, :) == df1 - @test unique(df, Colon()) == df1 - @test unique(df, 2:3) == df1 - @test unique(df, 3) == df1[1:3, :] - @test unique(df, [1, 3]) == df1 - @test unique(df, [:a, :c]) == df1 - @test unique(df, ["a", "c"]) == df1 - @test unique(df, r"[ac]") == df1 - @test unique(df, Not(2)) == df1 - @test unique(df, Not([2])) == df1 - @test unique(df, Not(:b)) == df1 - @test unique(df, Not([:b])) == df1 - @test unique(df, Not([false, true, false])) == df1 - @test unique(df, :a) == df1[1:2, :] - @test unique(df, "a") == df1[1:2, :] - @test unique(df, :a => x -> 1) == df[1:1, :] - @test unique(DataFrame()) == DataFrame() - @test isempty(nonunique(DataFrame())) && nonunique(DataFrame()) isa Vector{Bool} - @test_throws ArgumentError nonunique(DataFrame(a=1:3), []) - @test_throws ArgumentError unique(DataFrame(a=1:3), []) - - @test unique(copy(df1), "a") == 
unique(copy(df1), :a) == unique(copy(df1), 1) == - df1[1:2, :] - - unique!(df, [1, 3]) - @test df == df1 - for cols in (r"[ac]", Not(:b), Not(2), Not([:b]), Not([2]), Not([false, true, false])) - df = vcat(df1, df1) - unique!(df, cols) - @test df == df1 - end -end - @testset "filter() and filter!()" begin df = DataFrame(x=[3, 1, 2, 1], y=["b", "c", "a", "b"]) @test filter(r -> r[:x] > 1, df) == DataFrame(x=[3, 2], y=["b", "a"]) diff --git a/test/duplicates.jl b/test/duplicates.jl index ec85020c02..61c01874d2 100644 --- a/test/duplicates.jl +++ b/test/duplicates.jl @@ -1,6 +1,6 @@ module TestDuplicates -using Test, DataFrames, CategoricalArrays +using Test, DataFrames, CategoricalArrays, Random const ≅ = isequal @testset "nonunique" begin @@ -41,4 +41,138 @@ const ≅ = isequal @test_throws ArgumentError unique(pdf, true) end +@testset "nonunique, nonunique, unique! with extra argument" begin + df1 = DataFrame(a=Union{String, Missing}["a", "b", "a", "b", "a", "b"], + b=Vector{Union{Int, Missing}}(1:6), + c=Union{Int, Missing}[1:3;1:3]) + df = vcat(df1, df1) + @test findall(nonunique(df)) == collect(7:12) + @test findall(nonunique(df, :)) == collect(7:12) + @test findall(nonunique(df, Colon())) == collect(7:12) + @test findall(nonunique(df, :a)) == collect(3:12) + @test findall(nonunique(df, "a")) == collect(3:12) + @test findall(nonunique(df, [:a, :c])) == collect(7:12) + @test findall(nonunique(df, ["a", "c"])) == collect(7:12) + @test findall(nonunique(df, r"[ac]")) == collect(7:12) + @test findall(nonunique(df, Not(2))) == collect(7:12) + @test findall(nonunique(df, Not([2]))) == collect(7:12) + @test findall(nonunique(df, Not(:b))) == collect(7:12) + @test findall(nonunique(df, Not([:b]))) == collect(7:12) + @test findall(nonunique(df, Not([false, true, false]))) == collect(7:12) + @test findall(nonunique(df, [1, 3])) == collect(7:12) + @test findall(nonunique(df, 1)) == collect(3:12) + @test findall(nonunique(df, :a => x -> 1)) == 2:12 + + @test unique(df) == df1 
+ @test unique(df, :) == df1 + @test unique(df, Colon()) == df1 + @test unique(df, 2:3) == df1 + @test unique(df, 3) == df1[1:3, :] + @test unique(df, [1, 3]) == df1 + @test unique(df, [:a, :c]) == df1 + @test unique(df, ["a", "c"]) == df1 + @test unique(df, r"[ac]") == df1 + @test unique(df, Not(2)) == df1 + @test unique(df, Not([2])) == df1 + @test unique(df, Not(:b)) == df1 + @test unique(df, Not([:b])) == df1 + @test unique(df, Not([false, true, false])) == df1 + @test unique(df, :a) == df1[1:2, :] + @test unique(df, "a") == df1[1:2, :] + @test unique(df, :a => x -> 1) == df[1:1, :] + @test unique(DataFrame()) == DataFrame() + @test isempty(nonunique(DataFrame())) && nonunique(DataFrame()) isa Vector{Bool} + @test_throws ArgumentError nonunique(DataFrame(a=1:3), []) + @test_throws ArgumentError unique(DataFrame(a=1:3), []) + + @test unique(copy(df1), "a") == unique(copy(df1), :a) == unique(copy(df1), 1) == + df1[1:2, :] + + unique!(df, [1, 3]) + @test df == df1 + for cols in (r"[ac]", Not(:b), Not(2), Not([:b]), Not([2]), Not([false, true, false])) + df = vcat(df1, df1) + unique!(df, cols) + @test df == df1 + end +end + +@testset "keep argument to nonunique/unique/unique!" 
begin + df = DataFrame(a=[1, 2, 3, 1, 2, 1], + b=["a", "b", "c", "a", "b", "a"], + c=categorical(["a", "b", "c", "a", "b", "a"])) + for cols in (1, 2, 3, [1, 2], [1, 3], [2, 3], [1, 2, 3]) + @test nonunique(df, cols, keep=:first) == + [false, false, false, true, true, true] + @test nonunique(df, cols, keep=:last) == + [true, true, false, true, false, false] + @test nonunique(df, cols, keep=:noduplicates) == + [true, true, false, true, true, true] + @test nonunique(select(df, cols), keep=:first) == + [false, false, false, true, true, true] + @test nonunique(select(df, cols), keep=:last) == + [true, true, false, true, false, false] + @test nonunique(select(df, cols), keep=:noduplicates) == + [true, true, false, true, true, true] + + @test unique(df, cols, keep=:first) == + df[.![false, false, false, true, true, true], :] + @test unique(df, cols, keep=:last) == + df[.![true, true, false, true, false, false], :] + @test unique(df, cols, keep=:noduplicates) == + df[.![true, true, false, true, true, true], :] + @test unique(select(df, cols), keep=:first) == + df[.![false, false, false, true, true, true], Cols(cols)] + @test unique(select(df, cols), keep=:last) == + df[.![true, true, false, true, false, false], Cols(cols)] + @test unique(select(df, cols), keep=:noduplicates) == + df[.![true, true, false, true, true, true], Cols(cols)] + + @test unique!(copy(df), cols, keep=:first) == + df[.![false, false, false, true, true, true], :] + @test unique!(copy(df), cols, keep=:last) == + df[.![true, true, false, true, false, false], :] + @test unique!(copy(df), cols, keep=:noduplicates) == + df[.![true, true, false, true, true, true], :] + @test unique!(select(df, cols), keep=:first) == + df[.![false, false, false, true, true, true], Cols(cols)] + @test unique!(select(df, cols), keep=:last) == + df[.![true, true, false, true, false, false], Cols(cols)] + @test unique!(select(df, cols), keep=:noduplicates) == + df[.![true, true, false, true, true, true], Cols(cols)] + end + + # 
some larger randomized test + Random.seed!(1234) + df = DataFrame(a=rand(1:10^5, 10^5)) + df.b = string.(df.a) + df.c = categorical(df.b) + df.id = 1:10^5 + + for cols in (1, 2, 3, [1, 2], [1, 3], [2, 3], [1, 2, 3]) + @test select(unique(df, cols, keep=:first), cols, Not(cols)) == + combine(groupby(df, cols, sort=false), first) + @test select(unique(df, cols, keep=:last), cols, Not(cols)) == + sort(combine(groupby(df, cols, sort=false), last), :id) + @test select(unique(df, cols, keep=:noduplicates), cols, Not(cols)) == + sort(combine(groupby(df, cols, sort=false), + sdf -> nrow(sdf) == 1 ? sdf : NamedTuple()), :id) + end + + @test isempty(nonunique(DataFrame(), keep=:first)) + @test unique(DataFrame(a=[]), keep=:last) == DataFrame(a=[]) + @test unique!(DataFrame(), keep=:noduplicates) == DataFrame() + @test_throws ArgumentError nonunique(DataFrame(), keep=:a) + @test_throws ArgumentError unique(DataFrame(), keep=:b) + @test_throws ArgumentError unique!(DataFrame(), keep=:c) +end + +@testset "case when groups are not compressed in row_group_slots!" begin + df = DataFrame(x=repeat([1:1000; -1], 2)); + @test getindex.(keys(groupby(df, :x, sort=true)), 1) == [-1; 1:1000] + @test nonunique(df, :x) == [falses(1001); trues(1001)] + @test nonunique(df, :x, keep=:last) == [trues(1001); falses(1001)] + @test all(nonunique(df, :x, keep=:noduplicates)) +end + end # module