Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of dropmissing #3256

Merged
merged 41 commits into from
Jan 27, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
940c1cd
init
svilupp Dec 23, 2022
e3ae2b0
Merge remote-tracking branch 'upstream/main' into improve-dropmissing…
svilupp Jan 14, 2023
472a715
add dropmissing8 implementation
svilupp Jan 14, 2023
7b5e010
Update src/other/utils.jl
svilupp Jan 15, 2023
dc705e8
move last_ind to separate line, add inbounds
svilupp Jan 15, 2023
f52ed41
support general vectors in `getindex_disallowmissing`
svilupp Jan 15, 2023
72171c9
add general fallback for getindex_disallowmissing
svilupp Jan 15, 2023
4a596ee
method for `getindex_disallowmissing` for abstractdataframe
svilupp Jan 15, 2023
6326f32
changed inner to disallowmissing+view
svilupp Jan 21, 2023
080c018
Clean up
svilupp Jan 21, 2023
a0bf440
updated comments
svilupp Jan 22, 2023
dcfa4e1
update view format
svilupp Jan 22, 2023
9b4478f
Change back to @view, due to errors with `view()`
svilupp Jan 22, 2023
afef173
import view() explicitly
svilupp Jan 22, 2023
fc643bb
add note on dropmissing parallelism
svilupp Jan 22, 2023
d8248a4
added test for zero-column and CategoricalArrays
svilupp Jan 22, 2023
b63ee37
added test for view=true in dropmissing
svilupp Jan 22, 2023
bd343a3
add case for dropmissing!
svilupp Jan 22, 2023
16c278d
add disallowmissing=false test
svilupp Jan 22, 2023
eb998cd
removed duplicative test case
svilupp Jan 22, 2023
20526dd
add test case that disallowmissing=true targets only the required column
svilupp Jan 22, 2023
13deb84
simplified the docstring
svilupp Jan 22, 2023
fdc8e60
Update docs/src/lib/functions.md
svilupp Jan 22, 2023
504e4cd
Update test/data.jl
svilupp Jan 22, 2023
c004a91
Update src/abstractdataframe/abstractdataframe.jl
svilupp Jan 22, 2023
b895692
Update src/abstractdataframe/abstractdataframe.jl
svilupp Jan 22, 2023
ba2760a
Update src/other/utils.jl
svilupp Jan 22, 2023
d0f1187
Update test/data.jl
svilupp Jan 22, 2023
8c51985
Update test/data.jl
svilupp Jan 22, 2023
267e566
Update test/data.jl
svilupp Jan 22, 2023
94b091a
use `@spawn_or_run` to avoid duplication
svilupp Jan 23, 2023
fa288f6
add test for multithreaded path
svilupp Jan 23, 2023
5172e72
Reduced test case memory requirement
svilupp Jan 23, 2023
301bdea
Apply suggestions from code review
bkamins Jan 24, 2023
c6477bf
Update src/abstractdataframe/abstractdataframe.jl
bkamins Jan 24, 2023
3d82b90
Apply suggestions from code review
bkamins Jan 24, 2023
f26c99a
Apply suggestions from code review
bkamins Jan 24, 2023
fffda53
Update test/data.jl
bkamins Jan 24, 2023
82f193c
Update test/data.jl
bkamins Jan 24, 2023
1426e3a
Apply suggestions from code review
bkamins Jan 26, 2023
39580a2
Apply suggestions from code review
bkamins Jan 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -970,6 +970,7 @@ julia> dropmissing(df, [:x, :y])
@inline function dropmissing(df::AbstractDataFrame,
cols::Union{ColumnIndex, MultiColumnIndex}=:;
view::Bool=false, disallowmissing::Bool=!view)
# Changes to be made here
rowidxs = completecases(df, cols)
if view
if disallowmissing
Expand All @@ -983,6 +984,36 @@ julia> dropmissing(df, [:x, :y])
end
end

@inline function _getindex_disallowmissing(df::AbstractDataFrame,
row_inds::AbstractVector{T},
col_disallowmissing_inds::Union{ColumnIndex,MultiColumnIndex}) where {T<:Integer}

newdf = df[row_inds, :]
disallowmissing!(newdf, col_disallowmissing_inds)

return df
end

# new implementation; to be swapped with the above if design is okay
@inline function _dropmissing(df::AbstractDataFrame,
cols::Union{ColumnIndex,MultiColumnIndex}=:;
view::Bool=false, disallowmissing::Bool=!view)
rowidxs = completecases(df, cols)
if view
if disallowmissing
throw(ArgumentError("disallowmissing=true is incompatible with view=true"))
end
return Base.view(df, rowidxs, :)
else
if disallowmissing
newdf = _getindex_disallowmissing(df, rowidxs, cols)
else
newdf = df[rowidxs, :]
end
return newdf
end
end

"""
dropmissing!(df::AbstractDataFrame, cols=:; disallowmissing::Bool=true)

Expand Down
39 changes: 39 additions & 0 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,45 @@ Base.getindex(df::DataFrame, row_ind::Colon, col_inds::MultiColumnIndex) =
Base.getindex(df::DataFrame, row_ind::typeof(!), col_inds::MultiColumnIndex) =
select(df, index(df)[col_inds], copycols=false)

# Gets a subset of a DataFrame by `row_inds` and disallows missing values in columns specified by `col_disallowmissing_inds`
# Called by dropmissing()
function _getindex_disallowmissing(df::DataFrame, row_inds::AbstractVector{T},
svilupp marked this conversation as resolved.
Show resolved Hide resolved
col_disallowmissing_inds::Union{ColumnIndex,MultiColumnIndex,Colon}) where {T<:Integer}

# With knowledge of exact indices we can skip a lot of loop iterations (as opposed to filtering by a mask)
selected_rows = T === Bool ? _findall(row_inds) : row_inds
new_columns = Vector{AbstractVector}(undef, ncol(df))
df_columns = DataFrames._columns(df)

# We need to know the exact indices of columns that disallowmissing should be applied to
disallowmissing_columns_inds = index(df)[col_disallowmissing_inds]

# Threading decision rule borrowed from `_threaded_getindex`
if Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 1_000_000
@sync for i in eachindex(new_columns)
# for each column, check if disallowmissing should be applied
Threads.@spawn if i in disallowmissing_columns_inds
@inbounds new_columns[i] = _getindex_disallowmissing(df_columns[i], selected_rows)
else
@inbounds new_columns[i] = df_columns[i][selected_rows]
end
end
else
@inbounds for i in eachindex(new_columns)
if i in disallowmissing_columns_inds
new_columns[i] = _getindex_disallowmissing(df_columns[i], selected_rows)
else
new_columns[i] = df_columns[i][selected_rows]
end
end
end

newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false)
_copy_all_note_metadata!(newdf, df)

return newdf
end

##############################################################################
##
## setindex!()
Expand Down
19 changes: 19 additions & 0 deletions src/other/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -477,3 +477,22 @@ function _findall(B::BitVector)::Union{UnitRange{Int}, Vector{Int}}
end
@assert false "should not be reached"
end

# Creates a vector like `vect` without the missing type and copies only data at indices in `inds`
# Fuses two operations that were previously done by getindex + disallowmissing
# unsafe because it does not check for `inds` values to be not missing (ensured by the caller)
# Called by `_getindex_disallowmissing(df,...)`
function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T}
# General fallback, relies on custom vectors' own methods
return vect[inds]|>disallowmissing
svilupp marked this conversation as resolved.
Show resolved Hide resolved
end

function _getindex_disallowmissing(vect::Vector{T}, inds::AbstractVector{<:Integer}) where {T}
output = Vector{nonmissingtype(T)}(undef, length(inds))
svilupp marked this conversation as resolved.
Show resolved Hide resolved
last_ind = 0
@inbounds for i in eachindex(inds)
last_ind += 1
output[last_ind] = vect[inds[i]]
end
return output
end
svilupp marked this conversation as resolved.
Show resolved Hide resolved