From 70d1e233363566b23e8298d3f052da4681380f10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 27 Jan 2023 22:06:44 +0100 Subject: [PATCH] Improve allcombinations docstring + minor cleanups after #3256 (#3276) --- NEWS.md | 5 +++++ src/abstractdataframe/abstractdataframe.jl | 15 +++++++++++---- src/dataframe/dataframe.jl | 1 + test/data.jl | 22 ++++++++++++++-------- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/NEWS.md b/NEWS.md index c1cc03dbaa..78c369edbb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,11 @@ its parent ([#3273](https://github.com/JuliaData/DataFrames.jl/pull/3273)) +## Performance improvements + +* `dropmissing` creates new columns in a single pass if `disallowmissing=true` + ([#3256](https://github.com/JuliaData/DataFrames.jl/pull/3256)) + # DataFrames.jl v1.4.4 Patch Release Notes ## Bug fixes diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 4d7db3c5bc..fcdba42513 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -985,11 +985,19 @@ julia> dropmissing(df, [:x, :y]) # What column indices should disallowmissing be applied to cols_inds = BitSet(index(df)[cols]) - + use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000 @sync for (i, col) in enumerate(eachcol(df)) - @spawn_or_run use_threads if disallowmissing && (i in cols_inds) - new_columns[i] = Missings.disallowmissing(Base.view(col, selected_rows)) + @spawn_or_run use_threads if disallowmissing && (i in cols_inds) && + (Missing <: eltype(col) && eltype(col) !== Any) + # Perform this path only if column eltype allows missing values + # except Any, as nonmissingtype(Any) == Any. + # Under these conditions Missings.disallowmissing must allocate + # a fresh column + col_sel = Base.view(col, selected_rows) + new_col = Missings.disallowmissing(col_sel) + @assert new_col !== col_sel + new_columns[i] = new_col else new_columns[i] = col[selected_rows] end @@ -3421,4 +3429,3 @@ function Base.iterate(itr::Iterators.PartitionIterator{<:AbstractDataFrame}, sta r = min(state + itr.n - 1, last_idx) return view(itr.c, state:r, :), r + 1 end - diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 01e05fc206..925ef408cb 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -1472,6 +1472,7 @@ allcombinations(::Type{DataFrame}, pairs::Pair{<:AbstractString, <:Any}...) = allcombinations(DataFrame; kwargs...) Create a `DataFrame` from all combinations of values in passed arguments. +The first passed values vary fastest. Arguments associating a column name with values to expand can be specified either as `Pair`s passed as positional arguments, or as keyword arguments. diff --git a/test/data.jl b/test/data.jl index 02d953dd59..ba72a8dfa4 100644 --- a/test/data.jl +++ b/test/data.jl @@ -173,12 +173,11 @@ end @test isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame df = DataFrame(a=1:3, b=4:6) dfv = @view df[:, 2:1] - # TODO: re-enable after https://github.com/JuliaData/DataFrames.jl/issues/3272 is resolved - # @test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame + @test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame @test_throws ArgumentError dropmissing!(dfv) - @test_throws ArgumentError dropmissing(df1, []) - @test_throws ArgumentError dropmissing!(df1, []) - + @test_throws ArgumentError dropmissing(df1, []) + @test_throws ArgumentError dropmissing!(df1, []) + df = DataFrame(a=[1, missing, 3]) sdf = view(df, :, :) @test dropmissing(sdf) == DataFrame(a=[1, 3]) @@ -248,12 +247,12 @@ end # Deterministic drop mask: IF remainder of index position divided by 10 == column index THEN missing for i in 1:ncol(df) missing_mask = (eachindex(df[!, i]) .% 10) .== i - df[missing_mask, i] .= missing + df[missing_mask, i] .= missing end - + notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > ncol(df)] @test dropmissing(df) ≅ df[notmissing_rows, :] - + cols = [:x1, :x2] notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > length(cols)] returned = dropmissing(df, cols) @@ -261,6 +260,13 @@ end @test eltype(returned[:, cols[1]]) == nonmissingtype(eltype(df[:, cols[1]])) @test eltype(returned[:, cols[2]]) == nonmissingtype(eltype(df[:, cols[2]])) @test eltype(returned[:, ncol(df)]) == eltype(df[:, ncol(df)]) + + # correct handling of not propagating views + df = DataFrame(a=1:3, b=Any[11, missing, 13]) + df2 = dropmissing(df) + @test df2 == DataFrame(a=[1, 3], b=[11, 13]) + @test df2.a isa Vector{Int} + @test df2.b isa Vector{Any} end @testset "deleteat! https://github.com/JuliaLang/julia/pull/41646 bug workaround" begin