From 940c1cd006c9b763a0705ca5fc921257d2e394ba Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Fri, 23 Dec 2022 15:45:48 -0500 Subject: [PATCH 01/40] init --- src/abstractdataframe/abstractdataframe.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 9fba690d49..7fe9d08c34 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -970,6 +970,7 @@ julia> dropmissing(df, [:x, :y]) @inline function dropmissing(df::AbstractDataFrame, cols::Union{ColumnIndex, MultiColumnIndex}=:; view::Bool=false, disallowmissing::Bool=!view) + # Changes to be made here rowidxs = completecases(df, cols) if view if disallowmissing From 472a7157d4e7d4d8755b59ba658bb3622776041d Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sat, 14 Jan 2023 18:04:18 +0000 Subject: [PATCH 02/40] add dropmissing8 implementation --- src/abstractdataframe/abstractdataframe.jl | 20 +++++++++++ src/dataframe/dataframe.jl | 39 ++++++++++++++++++++++ src/other/utils.jl | 13 ++++++++ 3 files changed, 72 insertions(+) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 7fe9d08c34..1a2a616193 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -984,6 +984,26 @@ julia> dropmissing(df, [:x, :y]) end end +# new implementation; to be swapped with the above if design is okay +@inline function _dropmissing(df::AbstractDataFrame, + cols::Union{ColumnIndex,MultiColumnIndex}=:; + view::Bool=false, disallowmissing::Bool=!view) + rowidxs = completecases(df, cols) + if view + if disallowmissing + throw(ArgumentError("disallowmissing=true is incompatible with view=true")) + end + return Base.view(df, rowidxs, :) + else + if disallowmissing + newdf = _getindex_disallowmissing(df, rowidxs, cols) + else + newdf = df[rowidxs, :] + end + return newdf + end +end + """ dropmissing!(df::AbstractDataFrame, cols=:; disallowmissing::Bool=true) diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 01e05fc206..ced4f0a092 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -625,6 +625,45 @@ Base.getindex(df::DataFrame, row_ind::Colon, col_inds::MultiColumnIndex) = Base.getindex(df::DataFrame, row_ind::typeof(!), col_inds::MultiColumnIndex) = select(df, index(df)[col_inds], copycols=false) +# Gets a subset of a DataFrame by `row_inds` and disallows missing values in columns specified by `col_disallowmissing_inds` +# Called by dropmissing() +function _getindex_disallowmissing(df::DataFrame, row_inds::AbstractVector{T}, + col_disallowmissing_inds::Union{ColumnIndex,MultiColumnIndex,Colon}) where {T<:Integer} + + # With knowledge of exact indices we can skip a lot of loop iterations (as opposed to filtering by a mask) + selected_rows = T === Bool ? _findall(row_inds) : row_inds + new_columns = Vector{AbstractVector}(undef, ncol(df)) + df_columns = DataFrames._columns(df) + + # We need to know the exact indices of columns that disallowmissing should be applied to + disallowmissing_columns_inds = index(df)[col_disallowmissing_inds] + + # Threading decision rule borrowed from `_threaded_getindex` + if Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 1_000_000 + @sync for i in eachindex(new_columns) + # for each column, check if disallowmissing should be applied + Threads.@spawn if i in disallowmissing_columns_inds + @inbounds new_columns[i] = _getindex_disallowmissing(df_columns[i], selected_rows) + else + @inbounds new_columns[i] = df_columns[i][selected_rows] + end + end + else + @inbounds for i in eachindex(new_columns) + if i in disallowmissing_columns_inds + new_columns[i] = _getindex_disallowmissing(df_columns[i], selected_rows) + else + new_columns[i] = df_columns[i][selected_rows] + end + end + end + + newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false) + _copy_all_note_metadata!(newdf, df) + + return newdf +end + ############################################################################## ## ## setindex!() diff --git a/src/other/utils.jl b/src/other/utils.jl index dedd6ba997..2253c6c9b3 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -477,3 +477,16 @@ function _findall(B::BitVector)::Union{UnitRange{Int}, Vector{Int}} end @assert false "should not be reached" end + +# Creates a vector like `vect` without the missing type and copies only data at indices in `inds` +# Fuses two operations that were previously done by getindex + disallowmissing +# unsafe because it does not check for `inds` values to be not missing (ensured by the caller) +# Called by `_getindex_disallowmissing(df,...)` +function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T} + output = Vector{nonmissingtype(T)}(undef, length(inds)) + last_ind = 0 + for i in eachindex(inds) + output[last_ind+=1] = vect[inds[i]] + end + output +end \ No newline at end of file From 7b5e010fc088b45d68547b08727938b56eacabf3 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 15 Jan 2023 06:51:23 +0000 Subject: [PATCH 03/40] Update src/other/utils.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bogumił Kamiński --- src/other/utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/other/utils.jl b/src/other/utils.jl index 2253c6c9b3..1e9173da3d 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -488,5 +488,5 @@ function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector for i in eachindex(inds) output[last_ind+=1] = vect[inds[i]] end - output + return output end \ No newline at end of file From dc705e8b43016705bbfc7e1f3ceb8af3035cb5b5 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 15 Jan 2023 07:13:32 +0000 Subject: [PATCH 04/40] move last_ind to separate line, add inbounds --- src/other/utils.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/other/utils.jl b/src/other/utils.jl index 1e9173da3d..27ebf20cde 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -485,8 +485,9 @@ end function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T} output = Vector{nonmissingtype(T)}(undef, length(inds)) last_ind = 0 - for i in eachindex(inds) - output[last_ind+=1] = vect[inds[i]] + @inbounds for i in eachindex(inds) + last_ind += 1 + output[last_ind] = vect[inds[i]] end return output end \ No newline at end of file From f52ed417e01d7dae1465d47f198699ea569be539 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 15 Jan 2023 07:25:37 +0000 Subject: [PATCH 05/40] support general vectors in `getindex_disallowmissing` Uses Tables.allocatecolumn constructor --- src/other/utils.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/other/utils.jl b/src/other/utils.jl index 27ebf20cde..8e027f1ab5 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -483,6 +483,17 @@ end # unsafe because it does not check for `inds` values to be not missing (ensured by the caller) # Called by `_getindex_disallowmissing(df,...)` function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T} + # General constructor; custom scalars can overload `DataAPI.defaultarray` to signal the default array type + output = Tables.allocatecolumn(nonmissingtype(T), length(inds)) + last_ind = 0 + @inbounds for i in eachindex(inds) + last_ind += 1 + output[last_ind] = vect[inds[i]] + end + return output +end + +function _getindex_disallowmissing(vect::Vector{T}, inds::AbstractVector{<:Integer}) where {T} output = Vector{nonmissingtype(T)}(undef, length(inds)) last_ind = 0 @inbounds for i in eachindex(inds) From 72171c97355d447bc9d964721f78d4dfaa4a4f20 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 15 Jan 2023 07:55:04 +0000 Subject: [PATCH 06/40] add general fallback for getindex_disallowmissing --- src/other/utils.jl | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/other/utils.jl b/src/other/utils.jl index 8e027f1ab5..01cd80e3d4 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -483,14 +483,8 @@ end # unsafe because it does not check for `inds` values to be not missing (ensured by the caller) # Called by `_getindex_disallowmissing(df,...)` function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T} - # General constructor; custom scalars can overload `DataAPI.defaultarray` to signal the default array type - output = Tables.allocatecolumn(nonmissingtype(T), length(inds)) - last_ind = 0 - @inbounds for i in eachindex(inds) - last_ind += 1 - output[last_ind] = vect[inds[i]] - end - return output + # General fallback, relies on custom vectors' own methods + return vect[inds]|>disallowmissing end function _getindex_disallowmissing(vect::Vector{T}, inds::AbstractVector{<:Integer}) where {T} From 4a596ee0feb0731dc12b43b64c4f2216f273d9e4 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 15 Jan 2023 08:02:11 +0000 Subject: [PATCH 07/40] method for `getindex_disallowmissing` for abstractdataframe --- src/abstractdataframe/abstractdataframe.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 1a2a616193..c3570c067b 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -984,6 +984,16 @@ julia> dropmissing(df, [:x, :y]) end end +@inline function _getindex_disallowmissing(df::AbstractDataFrame, + row_inds::AbstractVector{T}, + col_disallowmissing_inds::Union{ColumnIndex,MultiColumnIndex}) where {T<:Integer} + + newdf = df[row_inds, :] + disallowmissing!(newdf, col_disallowmissing_inds) + + return df +end + # new implementation; to be swapped with the above if design is okay @inline function _dropmissing(df::AbstractDataFrame, cols::Union{ColumnIndex,MultiColumnIndex}=:; From 6326f32e0c1b32a7f67a9346637c4b9a9206eedc Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sat, 21 Jan 2023 11:18:14 +0000 Subject: [PATCH 08/40] changed inner to disallowmissing+view --- src/other/utils.jl | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/other/utils.jl b/src/other/utils.jl index 01cd80e3d4..4c32bb1070 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -483,16 +483,5 @@ end # unsafe because it does not check for `inds` values to be not missing (ensured by the caller) # Called by `_getindex_disallowmissing(df,...)` function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T} - # General fallback, relies on custom vectors' own methods - return vect[inds]|>disallowmissing -end - -function _getindex_disallowmissing(vect::Vector{T}, inds::AbstractVector{<:Integer}) where {T} - output = Vector{nonmissingtype(T)}(undef, length(inds)) - last_ind = 0 - @inbounds for i in eachindex(inds) - last_ind += 1 - output[last_ind] = vect[inds[i]] - end - return output + return disallowmissing(@view vect[inds]) end \ No newline at end of file From 080c0182ee040a08d1897921ddcd4ba6ac9cabb8 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sat, 21 Jan 2023 17:06:14 +0000 Subject: [PATCH 09/40] Clean up - merged all pathways into one function for all abstract dataframes - removed other methods - calling function disallowmissing explicitly from package Missings, as it others conflicts with the keyword name --- src/abstractdataframe/abstractdataframe.jl | 64 ++++++++++++---------- src/dataframe/dataframe.jl | 39 ------------- src/other/utils.jl | 8 --- 3 files changed, 35 insertions(+), 76 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index c3570c067b..84b768136b 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -970,7 +970,7 @@ julia> dropmissing(df, [:x, :y]) @inline function dropmissing(df::AbstractDataFrame, cols::Union{ColumnIndex, MultiColumnIndex}=:; view::Bool=false, disallowmissing::Bool=!view) - # Changes to be made here + # Identify Bool mask of which rows have no Missings rowidxs = completecases(df, cols) if view if disallowmissing @@ -978,38 +978,44 @@ julia> dropmissing(df, [:x, :y]) end return Base.view(df, rowidxs, :) else - newdf = df[rowidxs, :] - disallowmissing && disallowmissing!(newdf, cols) - return newdf - end -end - -@inline function _getindex_disallowmissing(df::AbstractDataFrame, - row_inds::AbstractVector{T}, - col_disallowmissing_inds::Union{ColumnIndex,MultiColumnIndex}) where {T<:Integer} - - newdf = df[row_inds, :] - disallowmissing!(newdf, col_disallowmissing_inds) + # With exact indices we can skip a lot of iterations (as opposed to indexing via a mask) + selected_rows = _findall(rowidxs) + new_columns = Vector{AbstractVector}(undef, ncol(df)) + df_columns = if typeof(df) == DataFrame + _columns(df) + else + # for SubDataFrame + collect(eachcol(df)) + end - return df -end + # What column indices should disallowmissing be applied to + cols_inds = index(df)[cols] -# new implementation; to be swapped with the above if design is okay -@inline function _dropmissing(df::AbstractDataFrame, - cols::Union{ColumnIndex,MultiColumnIndex}=:; - view::Bool=false, disallowmissing::Bool=!view) - rowidxs = completecases(df, cols) - if view - if disallowmissing - throw(ArgumentError("disallowmissing=true is incompatible with view=true")) - end - return Base.view(df, rowidxs, :) - else - if disallowmissing - newdf = _getindex_disallowmissing(df, rowidxs, cols) + # Threading decision rule borrowed from `_threaded_getindex` + if Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 1_000_000 + @sync for i in eachindex(new_columns) + # for each column, check if disallowmissing should be applied + Threads.@spawn if disallowmissing && (i in cols_inds) + new_columns[i] = Missings.disallowmissing(@view df_columns[i][selected_rows]) + else + new_columns[i] = df_columns[i][selected_rows] + end + end else - newdf = df[rowidxs, :] + for i in eachindex(new_columns) + if disallowmissing && (i in cols_inds) + new_columns[i] = Missings.disallowmissing(@view df_columns[i][selected_rows]) + else + new_columns[i] = df_columns[i][selected_rows] + end + end end + # Output will be DataFrame even if SubDataFrame is provided + # (assuming that view=false means user wants a DataFrame) + newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false) + + # technically, disallowmissing! drops all nonnote metadata but we never copied it, so no need to drop + _copy_all_note_metadata!(newdf, df) return newdf end end diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index ced4f0a092..01e05fc206 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -625,45 +625,6 @@ Base.getindex(df::DataFrame, row_ind::Colon, col_inds::MultiColumnIndex) = Base.getindex(df::DataFrame, row_ind::typeof(!), col_inds::MultiColumnIndex) = select(df, index(df)[col_inds], copycols=false) -# Gets a subset of a DataFrame by `row_inds` and disallows missing values in columns specified by `col_disallowmissing_inds` -# Called by dropmissing() -function _getindex_disallowmissing(df::DataFrame, row_inds::AbstractVector{T}, - col_disallowmissing_inds::Union{ColumnIndex,MultiColumnIndex,Colon}) where {T<:Integer} - - # With knowledge of exact indices we can skip a lot of loop iterations (as opposed to filtering by a mask) - selected_rows = T === Bool ? _findall(row_inds) : row_inds - new_columns = Vector{AbstractVector}(undef, ncol(df)) - df_columns = DataFrames._columns(df) - - # We need to know the exact indices of columns that disallowmissing should be applied to - disallowmissing_columns_inds = index(df)[col_disallowmissing_inds] - - # Threading decision rule borrowed from `_threaded_getindex` - if Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 1_000_000 - @sync for i in eachindex(new_columns) - # for each column, check if disallowmissing should be applied - Threads.@spawn if i in disallowmissing_columns_inds - @inbounds new_columns[i] = _getindex_disallowmissing(df_columns[i], selected_rows) - else - @inbounds new_columns[i] = df_columns[i][selected_rows] - end - end - else - @inbounds for i in eachindex(new_columns) - if i in disallowmissing_columns_inds - new_columns[i] = _getindex_disallowmissing(df_columns[i], selected_rows) - else - new_columns[i] = df_columns[i][selected_rows] - end - end - end - - newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false) - _copy_all_note_metadata!(newdf, df) - - return newdf -end - ############################################################################## ## ## setindex!() diff --git a/src/other/utils.jl b/src/other/utils.jl index 4c32bb1070..1f2deac792 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -476,12 +476,4 @@ function _findall(B::BitVector)::Union{UnitRange{Int}, Vector{Int}} end end @assert false "should not be reached" -end - -# Creates a vector like `vect` without the missing type and copies only data at indices in `inds` -# Fuses two operations that were previously done by getindex + disallowmissing -# unsafe because it does not check for `inds` values to be not missing (ensured by the caller) -# Called by `_getindex_disallowmissing(df,...)` -function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T} - return disallowmissing(@view vect[inds]) end \ No newline at end of file From a0bf4409a9ecf96934cb595368d970f00b0b997c Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 07:31:38 +0000 Subject: [PATCH 10/40] updated comments --- src/abstractdataframe/abstractdataframe.jl | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 84b768136b..d54312d4b6 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -978,15 +978,11 @@ julia> dropmissing(df, [:x, :y]) end return Base.view(df, rowidxs, :) else - # With exact indices we can skip a lot of iterations (as opposed to indexing via a mask) + # Faster when there are many columns (Indexing with Integers than via Bool mask) + # or when there are many Missing (as we skip a lot of iterations) selected_rows = _findall(rowidxs) new_columns = Vector{AbstractVector}(undef, ncol(df)) - df_columns = if typeof(df) == DataFrame - _columns(df) - else - # for SubDataFrame - collect(eachcol(df)) - end + df_columns = df isa DataFrame ? _columns(df) : collect(AbstractVector, eachcol(df)) # What column indices should disallowmissing be applied to cols_inds = index(df)[cols] @@ -1010,11 +1006,8 @@ julia> dropmissing(df, [:x, :y]) end end end - # Output will be DataFrame even if SubDataFrame is provided - # (assuming that view=false means user wants a DataFrame) newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false) - # technically, disallowmissing! drops all nonnote metadata but we never copied it, so no need to drop _copy_all_note_metadata!(newdf, df) return newdf end From dcfa4e178dbf755a82894cbb52b8d9fcafdae234 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 07:34:36 +0000 Subject: [PATCH 11/40] update view format --- src/abstractdataframe/abstractdataframe.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index d54312d4b6..7f97d59762 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -992,7 +992,7 @@ julia> dropmissing(df, [:x, :y]) @sync for i in eachindex(new_columns) # for each column, check if disallowmissing should be applied Threads.@spawn if disallowmissing && (i in cols_inds) - new_columns[i] = Missings.disallowmissing(@view df_columns[i][selected_rows]) + new_columns[i] = Missings.disallowmissing(view(df_columns[i],selected_rows)) else new_columns[i] = df_columns[i][selected_rows] end @@ -1000,7 +1000,7 @@ julia> dropmissing(df, [:x, :y]) else for i in eachindex(new_columns) if disallowmissing && (i in cols_inds) - new_columns[i] = Missings.disallowmissing(@view df_columns[i][selected_rows]) + new_columns[i] = Missings.disallowmissing(view(df_columns[i],selected_rows)) else new_columns[i] = df_columns[i][selected_rows] end From 9b4478f794eb770ec5a039f4fa376b286d2a61ca Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 07:53:01 +0000 Subject: [PATCH 12/40] Change back to @view, due to errors with `view()` --- src/abstractdataframe/abstractdataframe.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 7f97d59762..d4395c6cc1 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -992,7 +992,7 @@ julia> dropmissing(df, [:x, :y]) @sync for i in eachindex(new_columns) # for each column, check if disallowmissing should be applied Threads.@spawn if disallowmissing && (i in cols_inds) - new_columns[i] = Missings.disallowmissing(view(df_columns[i],selected_rows)) + new_columns[i] = Missings.disallowmissing(@view(df_columns[i][selected_rows])) else new_columns[i] = df_columns[i][selected_rows] end @@ -1000,7 +1000,7 @@ julia> dropmissing(df, [:x, :y]) else for i in eachindex(new_columns) if disallowmissing && (i in cols_inds) - new_columns[i] = Missings.disallowmissing(view(df_columns[i],selected_rows)) + new_columns[i] = Missings.disallowmissing(@view(df_columns[i][selected_rows])) else new_columns[i] = df_columns[i][selected_rows] end From afef17349ae43845d9a60f9e5eb7def0fd76a00c Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 10:20:54 +0000 Subject: [PATCH 13/40] import view() explicitly --- src/abstractdataframe/abstractdataframe.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index d4395c6cc1..910f4acfc7 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -992,7 +992,8 @@ julia> dropmissing(df, [:x, :y]) @sync for i in eachindex(new_columns) # for each column, check if disallowmissing should be applied Threads.@spawn if disallowmissing && (i in cols_inds) - new_columns[i] = Missings.disallowmissing(@view(df_columns[i][selected_rows])) + # implicit imports of functions due to name clash with keyword arguments! + new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i],selected_rows)) else new_columns[i] = df_columns[i][selected_rows] end @@ -1000,7 +1001,8 @@ julia> dropmissing(df, [:x, :y]) else for i in eachindex(new_columns) if disallowmissing && (i in cols_inds) - new_columns[i] = Missings.disallowmissing(@view(df_columns[i][selected_rows])) + # implicit imports of functions due to name clash with keyword arguments! + new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i],selected_rows)) else new_columns[i] = df_columns[i][selected_rows] end From fc643bbb34b58ec739756c6510ed72379f736338 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 10:32:03 +0000 Subject: [PATCH 14/40] add note on dropmissing parallelism I did not mention the heuristic rule about number of rows to make future updates easier --- docs/src/lib/functions.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 9b9de28471..95989e0c85 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -25,6 +25,8 @@ This is a list of operations that currently make use of multi-threading: * a transformation produces one row per group and the passed transformation is a custom function (i.e. not for standard reductions, which use optimized single-threaded methods). +- `dropmissing` when the provided `DataFrame` has more than 1 column and `view=false` + (subset-ing into a column without `Missing` is spawned in a separate task for each column). In general at least Julia 1.4 is required to ensure that multi-threading is used and the Julia process must be started with more than one thread. Some operations From d8248a4fb2eaf5adf49d9b352ba73ba7ced69531 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 10:48:11 +0000 Subject: [PATCH 15/40] added test for zero-column and CategoricalArrays --- test/data.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/data.jl b/test/data.jl index 3399ad35e7..610bbaf9a5 100644 --- a/test/data.jl +++ b/test/data.jl @@ -168,6 +168,10 @@ end @test df1b == df1 end + # Zero column case + isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame + @test_throws ArgumentError dropmissing(df1,[]) + df = DataFrame(a=[1, missing, 3]) sdf = view(df, :, :) @test dropmissing(sdf) == DataFrame(a=[1, 3]) @@ -200,6 +204,13 @@ end df = DataFrame(b=b) @test eltype(dropmissing(df).b) == Int @test eltype(dropmissing!(df).b) == Int + + # CategoricalArrays + c = Union{Int64, Missing}[1,2,1,missing]|>categorical + df = DataFrame(c=c) + @test dropmissing(df) == DataFrame(c=categorical([1, 2, 1])) + @test eltype(dropmissing(df).c) == CategoricalValue{Int64, UInt32} + @test eltype(dropmissing!(df).c) == CategoricalValue{Int64, UInt32} end @testset "deleteat! https://github.com/JuliaLang/julia/pull/41646 bug workaround" begin From b63ee3700ac238cac61b1a5ed041c3f55918e977 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 11:00:40 +0000 Subject: [PATCH 16/40] added test for view=true in dropmissing --- test/data.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/data.jl b/test/data.jl index 610bbaf9a5..c8327c33e4 100644 --- a/test/data.jl +++ b/test/data.jl @@ -190,6 +190,16 @@ end @test eltype(df2.a) == Union{Int, Missing} @test df.a == df2.a == [1, 3] + # view=true + df = DataFrame(a=[1, missing, 3]) + @test dropmissing(df, view=false) == DataFrame(a=[1, 3]) + @test dropmissing(df, view=true) == view(df, [1, 3], :) + @test typeof(dropmissing(df, view=true)) <: SubDataFrame + @test eltype(dropmissing(df, view=true, disallowmissing=false).a) == Union{Int, Missing} + @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true) + @test eltype(dropmissing(df, view=false, disallowmissing=false).a) == Union{Int, Missing} + @test eltype(dropmissing(df, view=false, disallowmissing=true).a) == Int + a = [1, 2] df = DataFrame(a=a, copycols=false) @test dropmissing!(df) === df From bd343a33887611704b4eaa489564baeee7074ccf Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 11:02:11 +0000 Subject: [PATCH 17/40] add case for dropmissing! --- test/data.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/data.jl b/test/data.jl index c8327c33e4..22e982d036 100644 --- a/test/data.jl +++ b/test/data.jl @@ -170,7 +170,9 @@ end # Zero column case isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame - @test_throws ArgumentError dropmissing(df1,[]) + isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame + @test_throws ArgumentError dropmissing(df1,[]) + @test_throws ArgumentError dropmissing!(df1,[]) df = DataFrame(a=[1, missing, 3]) sdf = view(df, :, :) From 16c278d7ce597d0be612ae4c63d291c82b3feb23 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 11:06:29 +0000 Subject: [PATCH 18/40] add disallowmissing=false test --- test/data.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/data.jl b/test/data.jl index 22e982d036..e5bdaded82 100644 --- a/test/data.jl +++ b/test/data.jl @@ -217,6 +217,12 @@ end @test eltype(dropmissing(df).b) == Int @test eltype(dropmissing!(df).b) == Int + # disallowmissing=false + b = Union{Int, Missing}[1, 2] + df = DataFrame(b=b) + @test eltype(dropmissing(df, disallowmissing=false).b) == Union{Int, Missing} + @test eltype(dropmissing!(df, disallowmissing=false).b) == Union{Int, Missing} + # CategoricalArrays c = Union{Int64, Missing}[1,2,1,missing]|>categorical df = DataFrame(c=c) From eb998cdb957d9b1a6bf4aab607bd9a787d9a4267 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 11:12:38 +0000 Subject: [PATCH 19/40] removed duplicative test case --- test/data.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data.jl b/test/data.jl index e5bdaded82..d2c5f85e33 100644 --- a/test/data.jl +++ b/test/data.jl @@ -198,7 +198,7 @@ end @test dropmissing(df, view=true) == view(df, [1, 3], :) @test typeof(dropmissing(df, view=true)) <: SubDataFrame @test eltype(dropmissing(df, view=true, disallowmissing=false).a) == Union{Int, Missing} - @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true) + # @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true) # tested separately @test eltype(dropmissing(df, view=false, disallowmissing=false).a) == Union{Int, Missing} @test eltype(dropmissing(df, view=false, disallowmissing=true).a) == Int From 20526dd39901f0016f3b56aee0a56862a4e91aa9 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 11:20:08 +0000 Subject: [PATCH 20/40] add test case that disallowmissing=true targets only the required column --- test/data.jl | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/test/data.jl b/test/data.jl index d2c5f85e33..1efcfc5263 100644 --- a/test/data.jl +++ b/test/data.jl @@ -218,10 +218,17 @@ end @test eltype(dropmissing!(df).b) == Int # disallowmissing=false + a = Union{Int, Missing}[3, 4] b = Union{Int, Missing}[1, 2] - df = DataFrame(b=b) - @test eltype(dropmissing(df, disallowmissing=false).b) == Union{Int, Missing} - @test eltype(dropmissing!(df, disallowmissing=false).b) == Union{Int, Missing} + df = DataFrame(;a,b) + @test eltype(dropmissing(df, disallowmissing=false).a) == Union{Int, Missing} + @test eltype(dropmissing!(copy(df), disallowmissing=false).a) == Union{Int, Missing} + @test eltype(dropmissing(df, disallowmissing=true).a) == Int + @test eltype(dropmissing!(copy(df), disallowmissing=true).a) == Int + @test eltype(dropmissing(df, :a, disallowmissing=true).a) == Int + @test eltype(dropmissing!(copy(df), :a, disallowmissing=true).a) == Int + @test eltype(dropmissing(df, :b, disallowmissing=true).a) == Union{Int, Missing} + @test eltype(dropmissing!(copy(df), :b, disallowmissing=true).a) == Union{Int, Missing} # CategoricalArrays c = Union{Int64, Missing}[1,2,1,missing]|>categorical From 13deb843a20d650818cc4e681940c97a4c1f8845 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 11:28:28 +0000 Subject: [PATCH 21/40] simplified the docstring --- docs/src/lib/functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 95989e0c85..7ab48ea779 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -26,7 +26,7 @@ This is a list of operations that currently make use of multi-threading: is a custom function (i.e. not for standard reductions, which use optimized single-threaded methods). - `dropmissing` when the provided `DataFrame` has more than 1 column and `view=false` - (subset-ing into a column without `Missing` is spawned in a separate task for each column). + (subset-ing of individual columns is spawned in separate tasks). In general at least Julia 1.4 is required to ensure that multi-threading is used and the Julia process must be started with more than one thread. Some operations From fdc8e60970483ed43b171340bc64869e90ca3dbb Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 19:58:18 +0000 Subject: [PATCH 22/40] Update docs/src/lib/functions.md Co-authored-by: Milan Bouchet-Valat --- docs/src/lib/functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 7ab48ea779..5217df4835 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -26,7 +26,7 @@ This is a list of operations that currently make use of multi-threading: is a custom function (i.e. not for standard reductions, which use optimized single-threaded methods). - `dropmissing` when the provided `DataFrame` has more than 1 column and `view=false` - (subset-ing of individual columns is spawned in separate tasks). + (subsetting of individual columns is spawned in separate tasks). In general at least Julia 1.4 is required to ensure that multi-threading is used and the Julia process must be started with more than one thread. Some operations From 504e4cd93e23351c36888fad53c2f082ab068afe Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 19:59:31 +0000 Subject: [PATCH 23/40] Update test/data.jl Co-authored-by: Milan Bouchet-Valat --- test/data.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data.jl b/test/data.jl index 1efcfc5263..d38ab046ca 100644 --- a/test/data.jl +++ b/test/data.jl @@ -231,7 +231,7 @@ end @test eltype(dropmissing!(copy(df), :b, disallowmissing=true).a) == Union{Int, Missing} # CategoricalArrays - c = Union{Int64, Missing}[1,2,1,missing]|>categorical + c = categorical([1, 2, 1, missing]) df = DataFrame(c=c) @test dropmissing(df) == DataFrame(c=categorical([1, 2, 1])) @test eltype(dropmissing(df).c) == CategoricalValue{Int64, UInt32} From c004a912cdf6225e5b25241747b99e23cb2f949e Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 19:59:53 +0000 Subject: [PATCH 24/40] Update src/abstractdataframe/abstractdataframe.jl Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/abstractdataframe.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 910f4acfc7..0a173592b0 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -992,7 +992,6 @@ julia> dropmissing(df, [:x, :y]) @sync for i in eachindex(new_columns) # for each column, check if disallowmissing should be applied Threads.@spawn if disallowmissing && (i in cols_inds) - # implicit imports of functions due to name clash with keyword arguments! new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i],selected_rows)) else new_columns[i] = df_columns[i][selected_rows] From b895692ea02af4453a294adae929c7db3dbed758 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 20:00:11 +0000 Subject: [PATCH 25/40] Update src/abstractdataframe/abstractdataframe.jl Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/abstractdataframe.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 0a173592b0..ab58faa884 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -992,7 +992,7 @@ julia> dropmissing(df, [:x, :y]) @sync for i in eachindex(new_columns) # for each column, check if disallowmissing should be applied Threads.@spawn if disallowmissing && (i in cols_inds) - new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i],selected_rows)) + new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i], selected_rows)) else new_columns[i] = df_columns[i][selected_rows] end From ba2760aa1b5fc48289f536811b280c6770a4ca45 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 20:00:28 +0000 Subject: [PATCH 26/40] Update src/other/utils.jl Co-authored-by: Milan Bouchet-Valat --- src/other/utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/other/utils.jl b/src/other/utils.jl index 1f2deac792..dedd6ba997 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -476,4 +476,4 @@ function _findall(B::BitVector)::Union{UnitRange{Int}, Vector{Int}} end end @assert false "should not be reached" -end \ No newline at end of file +end From d0f11879e4cc00536830f93ebfe6abf6c8d0f201 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 20:00:50 +0000 Subject: [PATCH 27/40] Update test/data.jl Co-authored-by: Milan Bouchet-Valat --- test/data.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/data.jl b/test/data.jl index d38ab046ca..1c2325fe40 100644 --- a/test/data.jl +++ b/test/data.jl @@ -171,8 +171,8 @@ end # Zero column case isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame - @test_throws ArgumentError dropmissing(df1,[]) - @test_throws ArgumentError dropmissing!(df1,[]) + @test_throws ArgumentError dropmissing(df1, []) + @test_throws ArgumentError dropmissing!(df1, []) df = DataFrame(a=[1, missing, 3]) sdf = view(df, :, :) From 8c5198545646e0ae83517b867d4666a572f063f2 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 20:02:47 +0000 Subject: [PATCH 28/40] Update test/data.jl Co-authored-by: Milan Bouchet-Valat --- test/data.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data.jl b/test/data.jl index 1c2325fe40..f9c0d97b03 100644 --- a/test/data.jl +++ b/test/data.jl @@ -217,7 +217,7 @@ end @test eltype(dropmissing(df).b) == Int @test eltype(dropmissing!(df).b) == Int - # disallowmissing=false + # disallowmissing argument a = Union{Int, Missing}[3, 4] b = Union{Int, Missing}[1, 2] df = DataFrame(;a,b) From 267e5662c2b95a4c9f6623ccf1c8945006007b79 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 22 Jan 2023 20:03:13 +0000 Subject: [PATCH 29/40] Update test/data.jl Co-authored-by: Milan Bouchet-Valat --- test/data.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/data.jl b/test/data.jl index f9c0d97b03..a37cc02325 100644 --- a/test/data.jl +++ b/test/data.jl @@ -234,8 +234,8 @@ end c = categorical([1, 2, 1, missing]) df = DataFrame(c=c) @test dropmissing(df) == DataFrame(c=categorical([1, 2, 1])) - @test eltype(dropmissing(df).c) == CategoricalValue{Int64, UInt32} - @test eltype(dropmissing!(df).c) == CategoricalValue{Int64, UInt32} + @test eltype(dropmissing(df).c) == CategoricalValue{Int, UInt32} + @test eltype(dropmissing!(df).c) == CategoricalValue{Int, UInt32} end @testset "deleteat! https://github.com/JuliaLang/julia/pull/41646 bug workaround" begin From 94b091aa3e7c5729aa091759bbfc35f280760d39 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Mon, 23 Jan 2023 07:07:24 +0000 Subject: [PATCH 30/40] use `@spawn_or_run` to avoid duplication --- src/abstractdataframe/abstractdataframe.jl | 27 +++++++--------------- test/data.jl | 2 +- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index ab58faa884..262d7816ff 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -986,27 +986,16 @@ julia> dropmissing(df, [:x, :y]) # What column indices should disallowmissing be applied to cols_inds = index(df)[cols] - - # Threading decision rule borrowed from `_threaded_getindex` - if Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 1_000_000 - @sync for i in eachindex(new_columns) - # for each column, check if disallowmissing should be applied - Threads.@spawn if disallowmissing && (i in cols_inds) - new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i], selected_rows)) - else - new_columns[i] = df_columns[i][selected_rows] - end - end - else - for i in eachindex(new_columns) - if disallowmissing && (i in cols_inds) - # implicit imports of functions due to name clash with keyword arguments! - new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i],selected_rows)) - else - new_columns[i] = df_columns[i][selected_rows] - end + + use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000 + @sync for i in eachindex(new_columns) + @spawn_or_run use_threads if disallowmissing && (i in cols_inds) + new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i], selected_rows)) + else + new_columns[i] = df_columns[i][selected_rows] end end + newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false) _copy_all_note_metadata!(newdf, df) diff --git a/test/data.jl b/test/data.jl index a37cc02325..c4c0d46a5d 100644 --- a/test/data.jl +++ b/test/data.jl @@ -198,7 +198,7 @@ end @test dropmissing(df, view=true) == view(df, [1, 3], :) @test typeof(dropmissing(df, view=true)) <: SubDataFrame @test eltype(dropmissing(df, view=true, disallowmissing=false).a) == Union{Int, Missing} - # @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true) # tested separately + @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true) @test eltype(dropmissing(df, view=false, disallowmissing=false).a) == Union{Int, Missing} @test eltype(dropmissing(df, view=false, disallowmissing=true).a) == Int From fa288f6d933fcbfc3b59508f73b2f6c15588e184 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Mon, 23 Jan 2023 07:37:43 +0000 Subject: [PATCH 31/40] add test for multithreaded path --- test/data.jl | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/data.jl b/test/data.jl index c4c0d46a5d..d289f28584 100644 --- a/test/data.jl +++ b/test/data.jl @@ -236,6 +236,26 @@ end @test dropmissing(df) == DataFrame(c=categorical([1, 2, 1])) @test eltype(dropmissing(df).c) == CategoricalValue{Int, UInt32} @test eltype(dropmissing!(df).c) == CategoricalValue{Int, UInt32} + + # Multithreaded execution test (must be at least ncol > 1, nrow > 100_000) + N_rows, N_cols = 10^7, 4 + df = DataFrame([rand(N_rows) for i in 1:N_cols], :auto) |> allowmissing + # Deterministic drop mask: IF remainder of index position divided by 10 == column index THEN missing + for i in 1:ncol(df) + missing_mask = (eachindex(df[!,i]) .% 10) .== i + df[missing_mask,i] .= missing + end + + notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > ncol(df)] + @test isequal(dropmissing(df), df[notmissing_rows, :]) + + cols = [:x1, :x2] + notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > length(cols)] + returned = dropmissing(df, cols) + @test isequal(returned, df[notmissing_rows, :]) + @test eltype(returned[:, cols[1]]) == nonmissingtype(eltype(df[:, cols[1]])) + @test eltype(returned[:, cols[2]]) == nonmissingtype(eltype(df[:, cols[2]])) + @test eltype(returned[:, ncol(df)]) == eltype(df[:, ncol(df)]) end @testset "deleteat! https://github.com/JuliaLang/julia/pull/41646 bug workaround" begin From 5172e72fe7c6e2345d245d8c125c0bb35f2a63b9 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Mon, 23 Jan 2023 13:11:37 +0000 Subject: [PATCH 32/40] Reduced test case memory requirement --- test/data.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data.jl b/test/data.jl index d289f28584..be05e08785 100644 --- a/test/data.jl +++ b/test/data.jl @@ -238,7 +238,7 @@ end @test eltype(dropmissing!(df).c) == CategoricalValue{Int, UInt32} # Multithreaded execution test (must be at least ncol > 1, nrow > 100_000) - N_rows, N_cols = 10^7, 4 + N_rows, N_cols = 200_000, 3 df = DataFrame([rand(N_rows) for i in 1:N_cols], :auto) |> allowmissing # Deterministic drop mask: IF remainder of index position divided by 10 == column index THEN missing for i in 1:ncol(df) From 301bdea4cd2eeb9590a3ab0c91a450c8c438b317 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 24 Jan 2023 16:47:45 +0100 Subject: [PATCH 33/40] Apply suggestions from code review --- docs/src/lib/functions.md | 2 +- src/abstractdataframe/abstractdataframe.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 5217df4835..625cf60665 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -25,7 +25,7 @@ This is a list of operations that currently make use of multi-threading: * a transformation produces one row per group and the passed transformation is a custom function (i.e. not for standard reductions, which use optimized single-threaded methods). -- `dropmissing` when the provided `DataFrame` has more than 1 column and `view=false` +- `dropmissing` when the provided data frame has more than 1 column and `view=false` (subsetting of individual columns is spawned in separate tasks). In general at least Julia 1.4 is required to ensure that multi-threading is used diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 262d7816ff..66f8b4d42c 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -978,7 +978,7 @@ julia> dropmissing(df, [:x, :y]) end return Base.view(df, rowidxs, :) else - # Faster when there are many columns (Indexing with Integers than via Bool mask) + # Faster when there are many columns (indexing with integers than via Bool mask) # or when there are many Missing (as we skip a lot of iterations) selected_rows = _findall(rowidxs) new_columns = Vector{AbstractVector}(undef, ncol(df)) @@ -996,7 +996,7 @@ julia> dropmissing(df, [:x, :y]) end end - newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false) + newdf = DataFrame(new_columns, copy(index(df)), copycols=false) _copy_all_note_metadata!(newdf, df) return newdf From c6477bf893508833625eb3f938a4ef870dbbf8c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 24 Jan 2023 16:49:58 +0100 Subject: [PATCH 34/40] Update src/abstractdataframe/abstractdataframe.jl --- src/abstractdataframe/abstractdataframe.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 66f8b4d42c..e0c40f751d 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -981,18 +981,17 @@ julia> dropmissing(df, [:x, :y]) # Faster when there are many columns (indexing with integers than via Bool mask) # or when there are many Missing (as we skip a lot of iterations) selected_rows = _findall(rowidxs) - new_columns = Vector{AbstractVector}(undef, ncol(df)) - df_columns = df isa DataFrame ? _columns(df) : collect(AbstractVector, eachcol(df)) + new_columns = df isa DataFrame ? copy(_columns(df)) : collect(AbstractVector, eachcol(df)) # What column indices should disallowmissing be applied to cols_inds = index(df)[cols] use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000 - @sync for i in eachindex(new_columns) + @sync for (i, col) in enumerate(new_columns) @spawn_or_run use_threads if disallowmissing && (i in cols_inds) - new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i], selected_rows)) + new_columns[i] = Missings.disallowmissing(Base.view(col, selected_rows)) else - new_columns[i] = df_columns[i][selected_rows] + new_columns[i] = col[selected_rows] end end From 3d82b90fbfd8562bfd40b6e198e411d44be52fb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 24 Jan 2023 16:52:43 +0100 Subject: [PATCH 35/40] Apply suggestions from code review --- src/abstractdataframe/abstractdataframe.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index e0c40f751d..b1d932e0a3 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -981,13 +981,13 @@ julia> dropmissing(df, [:x, :y]) # Faster when there are many columns (indexing with integers than via Bool mask) # or when there are many Missing (as we skip a lot of iterations) selected_rows = _findall(rowidxs) - new_columns = df isa DataFrame ? copy(_columns(df)) : collect(AbstractVector, eachcol(df)) + new_columns = Vector{AbstractVector}(undef, ncol(df)) # What column indices should disallowmissing be applied to cols_inds = index(df)[cols] use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000 - @sync for (i, col) in enumerate(new_columns) + @sync for (i, col) in enumerate(eachcol(df)) @spawn_or_run use_threads if disallowmissing && (i in cols_inds) new_columns[i] = Missings.disallowmissing(Base.view(col, selected_rows)) else From f26c99a36e0dbcaf1ada9b390dbd0f00ed1e08ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 24 Jan 2023 17:03:20 +0100 Subject: [PATCH 36/40] Apply suggestions from code review --- test/data.jl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/test/data.jl b/test/data.jl index be05e08785..dd8fd94190 100644 --- a/test/data.jl +++ b/test/data.jl @@ -169,8 +169,12 @@ end end # Zero column case - isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame - isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame + @test isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame + @test isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame + df = DataFrame(a=1:3, b=4:6) + dfv = @view df[:, 2:1] + @test isempty(dropmissing(dfv) && dropmissing(dfv) isa DataFrame + @test_throws ArgumentError dropmissing!(DataFrame()) @test_throws ArgumentError dropmissing(df1, []) @test_throws ArgumentError dropmissing!(df1, []) @@ -238,12 +242,12 @@ end @test eltype(dropmissing!(df).c) == CategoricalValue{Int, UInt32} # Multithreaded execution test (must be at least ncol > 1, nrow > 100_000) - N_rows, N_cols = 200_000, 3 + N_rows, N_cols = 110_000, 3 df = DataFrame([rand(N_rows) for i in 1:N_cols], :auto) |> allowmissing # Deterministic drop mask: IF remainder of index position divided by 10 == column index THEN missing for i in 1:ncol(df) - missing_mask = (eachindex(df[!,i]) .% 10) .== i - df[missing_mask,i] .= missing + missing_mask = (eachindex(df[!, i]) .% 10) .== i + df[missing_mask, i] .= missing end notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > ncol(df)] From fffda53be958608cc8881bdb5484bf1fc76dd645 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 24 Jan 2023 19:17:59 +0100 Subject: [PATCH 37/40] Update test/data.jl --- test/data.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/data.jl b/test/data.jl index dd8fd94190..33d7d082bb 100644 --- a/test/data.jl +++ b/test/data.jl @@ -173,7 +173,8 @@ end @test isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame df = DataFrame(a=1:3, b=4:6) dfv = @view df[:, 2:1] - @test isempty(dropmissing(dfv) && dropmissing(dfv) isa DataFrame + # TODO: re-enable after https://github.com/JuliaData/DataFrames.jl/issues/3272 is resolved + # @test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame @test_throws ArgumentError dropmissing!(DataFrame()) @test_throws ArgumentError dropmissing(df1, []) @test_throws ArgumentError dropmissing!(df1, []) From 82f193cb6a54e0b5f558426653038c310ae0a71d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 24 Jan 2023 21:46:51 +0100 Subject: [PATCH 38/40] Update test/data.jl --- test/data.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data.jl b/test/data.jl index 33d7d082bb..92f9deb7d1 100644 --- a/test/data.jl +++ b/test/data.jl @@ -175,7 +175,7 @@ end dfv = @view df[:, 2:1] # TODO: re-enable after https://github.com/JuliaData/DataFrames.jl/issues/3272 is resolved # @test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame - @test_throws ArgumentError dropmissing!(DataFrame()) + @test_throws ArgumentError dropmissing!(dfv) @test_throws ArgumentError dropmissing(df1, []) @test_throws ArgumentError dropmissing!(df1, []) From 1426e3ac0ae98520422bfa3522e786fb6d43c59b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 26 Jan 2023 16:40:57 +0100 Subject: [PATCH 39/40] Apply suggestions from code review --- src/abstractdataframe/abstractdataframe.jl | 4 ++-- test/data.jl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index b1d932e0a3..1891b8262a 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -979,12 +979,12 @@ julia> dropmissing(df, [:x, :y]) return Base.view(df, rowidxs, :) else # Faster when there are many columns (indexing with integers than via Bool mask) - # or when there are many Missing (as we skip a lot of iterations) + # or when there are few Missing (as we skip a lot of iterations) selected_rows = _findall(rowidxs) new_columns = Vector{AbstractVector}(undef, ncol(df)) # What column indices should disallowmissing be applied to - cols_inds = index(df)[cols] + cols_inds = BitSet(index(df)[cols]) use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000 @sync for (i, col) in enumerate(eachcol(df)) diff --git a/test/data.jl b/test/data.jl index 92f9deb7d1..02d953dd59 100644 --- a/test/data.jl +++ b/test/data.jl @@ -252,12 +252,12 @@ end end notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > ncol(df)] - @test isequal(dropmissing(df), df[notmissing_rows, :]) + @test dropmissing(df) ≅ df[notmissing_rows, :] cols = [:x1, :x2] notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > length(cols)] returned = dropmissing(df, cols) - @test isequal(returned, df[notmissing_rows, :]) + @test returned ≅ df[notmissing_rows, :] @test eltype(returned[:, cols[1]]) == nonmissingtype(eltype(df[:, cols[1]])) @test eltype(returned[:, cols[2]]) == nonmissingtype(eltype(df[:, cols[2]])) @test eltype(returned[:, ncol(df)]) == eltype(df[:, ncol(df)]) From 39580a23725c05fb293444a88c065a254d4ee3dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 27 Jan 2023 10:13:22 +0100 Subject: [PATCH 40/40] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/abstractdataframe.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 1891b8262a..4d7db3c5bc 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -970,7 +970,7 @@ julia> dropmissing(df, [:x, :y]) @inline function dropmissing(df::AbstractDataFrame, cols::Union{ColumnIndex, MultiColumnIndex}=:; view::Bool=false, disallowmissing::Bool=!view) - # Identify Bool mask of which rows have no Missings + # Identify Bool mask of which rows have no missings rowidxs = completecases(df, cols) if view if disallowmissing @@ -979,7 +979,7 @@ julia> dropmissing(df, [:x, :y]) return Base.view(df, rowidxs, :) else # Faster when there are many columns (indexing with integers than via Bool mask) - # or when there are few Missing (as we skip a lot of iterations) + # or when there are many missings (as we skip a lot of iterations) selected_rows = _findall(rowidxs) new_columns = Vector{AbstractVector}(undef, ncol(df))