diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 9b9de28471..625cf60665 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -25,6 +25,8 @@ This is a list of operations that currently make use of multi-threading: * a transformation produces one row per group and the passed transformation is a custom function (i.e. not for standard reductions, which use optimized single-threaded methods). +- `dropmissing` when the provided data frame has more than 1 column and `view=false` + (subsetting of individual columns is spawned in separate tasks). In general at least Julia 1.4 is required to ensure that multi-threading is used and the Julia process must be started with more than one thread. Some operations diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 9fba690d49..4d7db3c5bc 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -970,6 +970,7 @@ julia> dropmissing(df, [:x, :y]) @inline function dropmissing(df::AbstractDataFrame, cols::Union{ColumnIndex, MultiColumnIndex}=:; view::Bool=false, disallowmissing::Bool=!view) + # Identify Bool mask of which rows have no missings rowidxs = completecases(df, cols) if view if disallowmissing @@ -977,8 +978,26 @@ julia> dropmissing(df, [:x, :y]) end return Base.view(df, rowidxs, :) else - newdf = df[rowidxs, :] - disallowmissing && disallowmissing!(newdf, cols) + # Faster when there are many columns (indexing with integers than via Bool mask) + # or when there are many missings (as we skip a lot of iterations) + selected_rows = _findall(rowidxs) + new_columns = Vector{AbstractVector}(undef, ncol(df)) + + # What column indices should disallowmissing be applied to + cols_inds = BitSet(index(df)[cols]) + + use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000 + @sync for (i, col) in enumerate(eachcol(df)) + @spawn_or_run use_threads if disallowmissing && (i in cols_inds) + new_columns[i] = Missings.disallowmissing(Base.view(col, selected_rows)) + else + new_columns[i] = col[selected_rows] + end + end + + newdf = DataFrame(new_columns, copy(index(df)), copycols=false) + + _copy_all_note_metadata!(newdf, df) return newdf end end diff --git a/test/data.jl b/test/data.jl index 3399ad35e7..02d953dd59 100644 --- a/test/data.jl +++ b/test/data.jl @@ -168,6 +168,17 @@ end @test df1b == df1 end + # Zero column case + @test isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame + @test isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame + df = DataFrame(a=1:3, b=4:6) + dfv = @view df[:, 2:1] + # TODO: re-enable after https://github.com/JuliaData/DataFrames.jl/issues/3272 is resolved + # @test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame + @test_throws ArgumentError dropmissing!(dfv) + @test_throws ArgumentError dropmissing(df1, []) + @test_throws ArgumentError dropmissing!(df1, []) + df = DataFrame(a=[1, missing, 3]) sdf = view(df, :, :) @test dropmissing(sdf) == DataFrame(a=[1, 3]) @@ -186,6 +197,16 @@ end @test eltype(df2.a) == Union{Int, Missing} @test df.a == df2.a == [1, 3] + # view=true + df = DataFrame(a=[1, missing, 3]) + @test dropmissing(df, view=false) == DataFrame(a=[1, 3]) + @test dropmissing(df, view=true) == view(df, [1, 3], :) + @test typeof(dropmissing(df, view=true)) <: SubDataFrame + @test eltype(dropmissing(df, view=true, disallowmissing=false).a) == Union{Int, Missing} + @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true) + @test eltype(dropmissing(df, view=false, disallowmissing=false).a) == Union{Int, Missing} + @test eltype(dropmissing(df, view=false, disallowmissing=true).a) == Int + a = [1, 2] df = DataFrame(a=a, copycols=false) @test dropmissing!(df) === df @@ -200,6 +221,46 @@ end df = DataFrame(b=b) @test eltype(dropmissing(df).b) == Int @test eltype(dropmissing!(df).b) == Int + + # disallowmissing argument + a = Union{Int, Missing}[3, 4] + b = Union{Int, Missing}[1, 2] + df = DataFrame(;a,b) + @test eltype(dropmissing(df, disallowmissing=false).a) == Union{Int, Missing} + @test eltype(dropmissing!(copy(df), disallowmissing=false).a) == Union{Int, Missing} + @test eltype(dropmissing(df, disallowmissing=true).a) == Int + @test eltype(dropmissing!(copy(df), disallowmissing=true).a) == Int + @test eltype(dropmissing(df, :a, disallowmissing=true).a) == Int + @test eltype(dropmissing!(copy(df), :a, disallowmissing=true).a) == Int + @test eltype(dropmissing(df, :b, disallowmissing=true).a) == Union{Int, Missing} + @test eltype(dropmissing!(copy(df), :b, disallowmissing=true).a) == Union{Int, Missing} + + # CategoricalArrays + c = categorical([1, 2, 1, missing]) + df = DataFrame(c=c) + @test dropmissing(df) == DataFrame(c=categorical([1, 2, 1])) + @test eltype(dropmissing(df).c) == CategoricalValue{Int, UInt32} + @test eltype(dropmissing!(df).c) == CategoricalValue{Int, UInt32} + + # Multithreaded execution test (must be at least ncol > 1, nrow > 100_000) + N_rows, N_cols = 110_000, 3 + df = DataFrame([rand(N_rows) for i in 1:N_cols], :auto) |> allowmissing + # Deterministic drop mask: IF remainder of index position divided by 10 == column index THEN missing + for i in 1:ncol(df) + missing_mask = (eachindex(df[!, i]) .% 10) .== i + df[missing_mask, i] .= missing + end + + notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > ncol(df)] + @test dropmissing(df) ≅ df[notmissing_rows, :] + + cols = [:x1, :x2] + notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > length(cols)] + returned = dropmissing(df, cols) + @test returned ≅ df[notmissing_rows, :] + @test eltype(returned[:, cols[1]]) == nonmissingtype(eltype(df[:, cols[1]])) + @test eltype(returned[:, cols[2]]) == nonmissingtype(eltype(df[:, cols[2]])) + @test eltype(returned[:, ncol(df)]) == eltype(df[:, ncol(df)]) end @testset "deleteat! https://github.com/JuliaLang/julia/pull/41646 bug workaround" begin