Skip to content

Commit

Permalink
Optimize completecases to process only missingable columns and be type stable (#2726)
Browse files Browse the repository at this point in the history
  • Loading branch information
pstorozenko authored May 7, 2021
1 parent c3083fc commit bcaa2e5
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 6 deletions.
22 changes: 18 additions & 4 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -761,14 +761,28 @@ function completecases(df::AbstractDataFrame, col::Colon=:)
"data frame with no columns"))
end
res = trues(size(df, 1))
aux = BitVector(undef, size(df, 1))
for i in 1:size(df, 2)
res .&= .!ismissing.(df[!, i])
v = df[!, i]
if Missing <: eltype(v)
# Disable fused broadcasting as it happens to be much slower
aux .= .!ismissing.(v)
res .&= aux
end
end
res
return res
end

# Single-column method, one-line form: builds the row mask by broadcasting
# `!ismissing` over the selected column, whatever the column's eltype is.
# NOTE(review): a method with the identical signature is defined immediately
# below; in this diff view this one-liner appears to be the pre-change
# version that the commit removes — confirm against the repository.
completecases(df::AbstractDataFrame, col::ColumnIndex) =
.!ismissing.(df[!, col])
# Return a `BitVector` with one entry per row of `df` that is `true` where
# column `col` holds a non-missing value.
#
# The column lookup `df[!, col]` happens first, so an invalid `col` fails
# before any mask is allocated. Both branches return a `BitVector`, so the
# result type does not depend on the column's element type.
function completecases(df::AbstractDataFrame, col::ColumnIndex)
    column = df[!, col]
    nrows = size(df, 1)

    # A column whose eltype cannot hold `missing` is complete in every row,
    # so there is no need to scan it.
    Missing <: eltype(column) || return trues(nrows)

    # Write the broadcast result into a preallocated bit mask.
    mask = BitVector(undef, nrows)
    mask .= .!ismissing.(column)
    return mask
end

# Multi-column method: restrict `df` to the columns selected by `cols`, then
# delegate to the method that takes only a data frame (checks all columns).
function completecases(df::AbstractDataFrame, cols::MultiColumnIndex)
    selected = df[!, cols]
    return completecases(selected)
end
Expand Down
15 changes: 13 additions & 2 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,17 @@ end
:auto)
df2 = DataFrame([Union{Int, Missing}[1, 2, 3, 4], ["one", "two", missing, "four"]],
:auto)

@test df2[completecases(df2), :] == df2[[1, 2, 4], :]
df3 = DataFrame(x = Int[1, 2, 3, 4], y = Union{Int, Missing}[1, missing, 2, 3],
z = Missing[missing, missing, missing, missing])

@test completecases(df2) == .!ismissing.(df2.x2)
@test @inferred(completecases(df3, :x)) == trues(nrow(df3))
@test completecases(df3, :y) == .!ismissing.(df3.y)
@test completecases(df3, :z) == completecases(df3, [:z, :x]) ==
completecases(df3, [:x, :z]) == completecases(df3, [:y, :x, :z]) ==
falses(nrow(df3))
@test @inferred(completecases(df3, [:y, :x])) ==
completecases(df3, [:x, :y]) == .!ismissing.(df3.y)
@test dropmissing(df2) == df2[[1, 2, 4], :]
returned = dropmissing(df1)
@test df1 == returned && df1 !== returned
Expand All @@ -127,7 +136,9 @@ end
@test df1b == df1

@test_throws ArgumentError completecases(DataFrame())
@test_throws ArgumentError completecases(DataFrame(x=1:3), Cols())
@test_throws MethodError completecases(DataFrame(x=1), true)
@test_throws ArgumentError completecases(df3, :a)

for cols in (:x2, "x2", [:x2], ["x2"], [:x1, :x2], ["x1", "x2"], 2, [2], 1:2,
[true, true], [false, true], :,
Expand Down

0 comments on commit bcaa2e5

Please sign in to comment.