Skip to content

Commit

Permalink
Improve allcombinations docstring + minor cleanups after #3256 (#3276)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Jan 27, 2023
1 parent fdd9193 commit 70d1e23
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 12 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@
its parent
([#3273](https://github.com/JuliaData/DataFrames.jl/pull/3273))

## Performance improvements

* `dropmissing` creates new columns in a single pass if `disallowmissing=true`
([#3256](https://github.com/JuliaData/DataFrames.jl/pull/3256))

# DataFrames.jl v1.4.4 Patch Release Notes

## Bug fixes
Expand Down
15 changes: 11 additions & 4 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -985,11 +985,19 @@ julia> dropmissing(df, [:x, :y])

# What column indices should disallowmissing be applied to
cols_inds = BitSet(index(df)[cols])

use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000
@sync for (i, col) in enumerate(eachcol(df))
@spawn_or_run use_threads if disallowmissing && (i in cols_inds)
new_columns[i] = Missings.disallowmissing(Base.view(col, selected_rows))
@spawn_or_run use_threads if disallowmissing && (i in cols_inds) &&
(Missing <: eltype(col) && eltype(col) !== Any)
# Perform this path only if column eltype allows missing values
# except Any, as nonmissingtype(Any) == Any.
# Under these conditions Missings.disallowmissing must allocate
# a fresh column
col_sel = Base.view(col, selected_rows)
new_col = Missings.disallowmissing(col_sel)
@assert new_col !== col_sel
new_columns[i] = new_col
else
new_columns[i] = col[selected_rows]
end
Expand Down Expand Up @@ -3421,4 +3429,3 @@ function Base.iterate(itr::Iterators.PartitionIterator{<:AbstractDataFrame}, sta
r = min(state + itr.n - 1, last_idx)
return view(itr.c, state:r, :), r + 1
end

1 change: 1 addition & 0 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1472,6 +1472,7 @@ allcombinations(::Type{DataFrame}, pairs::Pair{<:AbstractString, <:Any}...) =
allcombinations(DataFrame; kwargs...)
Create a `DataFrame` from all combinations of values in passed arguments.
The first passed values vary fastest.
Arguments associating a column name with values to expand can be specified
either as `Pair`s passed as positional arguments, or as keyword arguments.
Expand Down
22 changes: 14 additions & 8 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,11 @@ end
@test isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame
df = DataFrame(a=1:3, b=4:6)
dfv = @view df[:, 2:1]
# TODO: re-enable after https://github.com/JuliaData/DataFrames.jl/issues/3272 is resolved
# @test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame
@test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame
@test_throws ArgumentError dropmissing!(dfv)
@test_throws ArgumentError dropmissing(df1, [])
@test_throws ArgumentError dropmissing!(df1, [])
@test_throws ArgumentError dropmissing(df1, [])
@test_throws ArgumentError dropmissing!(df1, [])

df = DataFrame(a=[1, missing, 3])
sdf = view(df, :, :)
@test dropmissing(sdf) == DataFrame(a=[1, 3])
Expand Down Expand Up @@ -248,19 +247,26 @@ end
# Deterministic drop mask: IF remainder of index position divided by 10 == column index THEN missing
for i in 1:ncol(df)
missing_mask = (eachindex(df[!, i]) .% 10) .== i
df[missing_mask, i] .= missing
df[missing_mask, i] .= missing
end

notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > ncol(df)]
@test dropmissing(df) df[notmissing_rows, :]

cols = [:x1, :x2]
notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > length(cols)]
returned = dropmissing(df, cols)
@test returned df[notmissing_rows, :]
@test eltype(returned[:, cols[1]]) == nonmissingtype(eltype(df[:, cols[1]]))
@test eltype(returned[:, cols[2]]) == nonmissingtype(eltype(df[:, cols[2]]))
@test eltype(returned[:, ncol(df)]) == eltype(df[:, ncol(df)])

# correct handling of not propagating views
df = DataFrame(a=1:3, b=Any[11, missing, 13])
df2 = dropmissing(df)
@test df2 == DataFrame(a=[1, 3], b=[11, 13])
@test df2.a isa Vector{Int}
@test df2.b isa Vector{Any}
end

@testset "deleteat! https://github.com/JuliaLang/julia/pull/41646 bug workaround" begin
Expand Down

0 comments on commit 70d1e23

Please sign in to comment.