From 940c1cd006c9b763a0705ca5fc921257d2e394ba Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Fri, 23 Dec 2022 15:45:48 -0500
Subject: [PATCH 01/40] init

---
 src/abstractdataframe/abstractdataframe.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 9fba690d49..7fe9d08c34 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -970,6 +970,7 @@ julia> dropmissing(df, [:x, :y])
 @inline function dropmissing(df::AbstractDataFrame,
                              cols::Union{ColumnIndex, MultiColumnIndex}=:;
                              view::Bool=false, disallowmissing::Bool=!view)
+    # Changes to be made here
     rowidxs = completecases(df, cols)
     if view
         if disallowmissing

From 472a7157d4e7d4d8755b59ba658bb3622776041d Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sat, 14 Jan 2023 18:04:18 +0000
Subject: [PATCH 02/40] add dropmissing8 implementation

---
 src/abstractdataframe/abstractdataframe.jl | 20 +++++++++++
 src/dataframe/dataframe.jl                 | 39 ++++++++++++++++++++++
 src/other/utils.jl                         | 13 ++++++++
 3 files changed, 72 insertions(+)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 7fe9d08c34..1a2a616193 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -984,6 +984,26 @@ julia> dropmissing(df, [:x, :y])
     end
 end
 
+# new implementation; to be swapped with the above if design is okay
+@inline function _dropmissing(df::AbstractDataFrame,
+    cols::Union{ColumnIndex,MultiColumnIndex}=:;
+    view::Bool=false, disallowmissing::Bool=!view)
+    rowidxs = completecases(df, cols)
+    if view
+        if disallowmissing
+            throw(ArgumentError("disallowmissing=true is incompatible with view=true"))
+        end
+        return Base.view(df, rowidxs, :)
+    else
+        if disallowmissing
+            newdf = _getindex_disallowmissing(df, rowidxs, cols)
+        else
+            newdf = df[rowidxs, :]
+        end
+        return newdf
+    end
+end
+
 """
     dropmissing!(df::AbstractDataFrame, cols=:; disallowmissing::Bool=true)
 
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
index 01e05fc206..ced4f0a092 100755
--- a/src/dataframe/dataframe.jl
+++ b/src/dataframe/dataframe.jl
@@ -625,6 +625,45 @@ Base.getindex(df::DataFrame, row_ind::Colon, col_inds::MultiColumnIndex) =
 Base.getindex(df::DataFrame, row_ind::typeof(!), col_inds::MultiColumnIndex) =
     select(df, index(df)[col_inds], copycols=false)
 
+# Gets a subset of a DataFrame by `row_inds` and disallows missing values in columns specified by `col_disallowmissing_inds` 
+# Called by dropmissing()
+function _getindex_disallowmissing(df::DataFrame, row_inds::AbstractVector{T},
+    col_disallowmissing_inds::Union{ColumnIndex,MultiColumnIndex,Colon}) where {T<:Integer}
+
+    # With knowledge of exact indices we can skip a lot of loop iterations (as opposed to filtering by a mask)
+    selected_rows = T === Bool ? _findall(row_inds) : row_inds
+    new_columns = Vector{AbstractVector}(undef, ncol(df))
+    df_columns = DataFrames._columns(df)
+
+    # We need to know the exact indices of columns that disallowmissing should be applied to
+    disallowmissing_columns_inds = index(df)[col_disallowmissing_inds]
+
+    # Threading decision rule borrowed from `_threaded_getindex`
+    if Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 1_000_000
+        @sync for i in eachindex(new_columns)
+            # for each column, check if disallowmissing should be applied
+            Threads.@spawn if i in disallowmissing_columns_inds
+                @inbounds new_columns[i] = _getindex_disallowmissing(df_columns[i], selected_rows)
+            else
+                @inbounds new_columns[i] = df_columns[i][selected_rows]
+            end
+        end
+    else
+        @inbounds for i in eachindex(new_columns)
+            if i in disallowmissing_columns_inds
+                new_columns[i] = _getindex_disallowmissing(df_columns[i], selected_rows)
+            else
+                new_columns[i] = df_columns[i][selected_rows]
+            end
+        end
+    end
+
+    newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false)
+    _copy_all_note_metadata!(newdf, df)
+
+    return newdf
+end
+
 ##############################################################################
 ##
 ## setindex!()
diff --git a/src/other/utils.jl b/src/other/utils.jl
index dedd6ba997..2253c6c9b3 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -477,3 +477,16 @@ function _findall(B::BitVector)::Union{UnitRange{Int}, Vector{Int}}
     end
     @assert false "should not be reached"
 end
+
+# Creates a vector like `vect` without the missing type and copies only data at indices in `inds`
+# Fuses two operations that were previously done by getindex + disallowmissing
+# unsafe because it does not check for `inds` values to be not missing (ensured by the caller)
+# Called by `_getindex_disallowmissing(df,...)`
+function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T}
+    output = Vector{nonmissingtype(T)}(undef, length(inds))
+    last_ind = 0
+    for i in eachindex(inds)
+        output[last_ind+=1] = vect[inds[i]]
+    end
+    output
+end
\ No newline at end of file

From 7b5e010fc088b45d68547b08727938b56eacabf3 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 15 Jan 2023 06:51:23 +0000
Subject: [PATCH 03/40] Update src/other/utils.jl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Bogumił Kamiński <bkamins@sgh.waw.pl>
---
 src/other/utils.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/other/utils.jl b/src/other/utils.jl
index 2253c6c9b3..1e9173da3d 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -488,5 +488,5 @@ function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector
     for i in eachindex(inds)
         output[last_ind+=1] = vect[inds[i]]
     end
-    output
+    return output
 end
\ No newline at end of file

From dc705e8b43016705bbfc7e1f3ceb8af3035cb5b5 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 15 Jan 2023 07:13:32 +0000
Subject: [PATCH 04/40] move last_ind to separate line, add inbounds

---
 src/other/utils.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/other/utils.jl b/src/other/utils.jl
index 1e9173da3d..27ebf20cde 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -485,8 +485,9 @@ end
 function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T}
     output = Vector{nonmissingtype(T)}(undef, length(inds))
     last_ind = 0
-    for i in eachindex(inds)
-        output[last_ind+=1] = vect[inds[i]]
+    @inbounds for i in eachindex(inds)
+        last_ind += 1
+        output[last_ind] = vect[inds[i]]
     end
     return output
 end
\ No newline at end of file

From f52ed417e01d7dae1465d47f198699ea569be539 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 15 Jan 2023 07:25:37 +0000
Subject: [PATCH 05/40] support general vectors in `getindex_disallowmissing`

Uses Tables.allocatecolumn constructor
---
 src/other/utils.jl | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/other/utils.jl b/src/other/utils.jl
index 27ebf20cde..8e027f1ab5 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -483,6 +483,17 @@ end
 # unsafe because it does not check for `inds` values to be not missing (ensured by the caller)
 # Called by `_getindex_disallowmissing(df,...)`
 function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T}
+    # General constructor; custom scalars can overload `DataAPI.defaultarray` to signal the default array type
+    output = Tables.allocatecolumn(nonmissingtype(T), length(inds))
+    last_ind = 0
+    @inbounds for i in eachindex(inds)
+        last_ind += 1
+        output[last_ind] = vect[inds[i]]
+    end
+    return output
+end
+
+function _getindex_disallowmissing(vect::Vector{T}, inds::AbstractVector{<:Integer}) where {T}
     output = Vector{nonmissingtype(T)}(undef, length(inds))
     last_ind = 0
     @inbounds for i in eachindex(inds)

From 72171c97355d447bc9d964721f78d4dfaa4a4f20 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 15 Jan 2023 07:55:04 +0000
Subject: [PATCH 06/40] add general fallback for getindex_disallowmissing

---
 src/other/utils.jl | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/other/utils.jl b/src/other/utils.jl
index 8e027f1ab5..01cd80e3d4 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -483,14 +483,8 @@ end
 # unsafe because it does not check for `inds` values to be not missing (ensured by the caller)
 # Called by `_getindex_disallowmissing(df,...)`
 function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T}
-    # General constructor; custom scalars can overload `DataAPI.defaultarray` to signal the default array type
-    output = Tables.allocatecolumn(nonmissingtype(T), length(inds))
-    last_ind = 0
-    @inbounds for i in eachindex(inds)
-        last_ind += 1
-        output[last_ind] = vect[inds[i]]
-    end
-    return output
+    # General fallback, relies on custom vectors' own methods
+    return vect[inds]|>disallowmissing
 end
 
 function _getindex_disallowmissing(vect::Vector{T}, inds::AbstractVector{<:Integer}) where {T}

From 4a596ee0feb0731dc12b43b64c4f2216f273d9e4 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 15 Jan 2023 08:02:11 +0000
Subject: [PATCH 07/40] method for `getindex_disallowmissing` for
 abstractdataframe

---
 src/abstractdataframe/abstractdataframe.jl | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 1a2a616193..c3570c067b 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -984,6 +984,16 @@ julia> dropmissing(df, [:x, :y])
     end
 end
 
+@inline function _getindex_disallowmissing(df::AbstractDataFrame,
+    row_inds::AbstractVector{T},
+    col_disallowmissing_inds::Union{ColumnIndex,MultiColumnIndex}) where {T<:Integer}
+
+    newdf = df[row_inds, :]
+    disallowmissing!(newdf, col_disallowmissing_inds)
+
+    return df
+end
+
 # new implementation; to be swapped with the above if design is okay
 @inline function _dropmissing(df::AbstractDataFrame,
     cols::Union{ColumnIndex,MultiColumnIndex}=:;

From 6326f32e0c1b32a7f67a9346637c4b9a9206eedc Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sat, 21 Jan 2023 11:18:14 +0000
Subject: [PATCH 08/40] changed inner to disallowmissing+view

---
 src/other/utils.jl | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/other/utils.jl b/src/other/utils.jl
index 01cd80e3d4..4c32bb1070 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -483,16 +483,5 @@ end
 # unsafe because it does not check for `inds` values to be not missing (ensured by the caller)
 # Called by `_getindex_disallowmissing(df,...)`
 function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T}
-    # General fallback, relies on custom vectors' own methods
-    return vect[inds]|>disallowmissing
-end
-
-function _getindex_disallowmissing(vect::Vector{T}, inds::AbstractVector{<:Integer}) where {T}
-    output = Vector{nonmissingtype(T)}(undef, length(inds))
-    last_ind = 0
-    @inbounds for i in eachindex(inds)
-        last_ind += 1
-        output[last_ind] = vect[inds[i]]
-    end
-    return output
+    return disallowmissing(@view vect[inds])
 end
\ No newline at end of file

From 080c0182ee040a08d1897921ddcd4ba6ac9cabb8 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sat, 21 Jan 2023 17:06:14 +0000
Subject: [PATCH 09/40] Clean up

- merged all pathways into one function for all abstract dataframes
- removed other methods
- calling function disallowmissing explicitly from package Missings, as it others conflicts with the keyword name
---
 src/abstractdataframe/abstractdataframe.jl | 64 ++++++++++++----------
 src/dataframe/dataframe.jl                 | 39 -------------
 src/other/utils.jl                         |  8 ---
 3 files changed, 35 insertions(+), 76 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index c3570c067b..84b768136b 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -970,7 +970,7 @@ julia> dropmissing(df, [:x, :y])
 @inline function dropmissing(df::AbstractDataFrame,
                              cols::Union{ColumnIndex, MultiColumnIndex}=:;
                              view::Bool=false, disallowmissing::Bool=!view)
-    # Changes to be made here
+    # Identify Bool mask of which rows have no Missings
     rowidxs = completecases(df, cols)
     if view
         if disallowmissing
@@ -978,38 +978,44 @@ julia> dropmissing(df, [:x, :y])
         end
         return Base.view(df, rowidxs, :)
     else
-        newdf = df[rowidxs, :]
-        disallowmissing && disallowmissing!(newdf, cols)
-        return newdf
-    end
-end
-
-@inline function _getindex_disallowmissing(df::AbstractDataFrame,
-    row_inds::AbstractVector{T},
-    col_disallowmissing_inds::Union{ColumnIndex,MultiColumnIndex}) where {T<:Integer}
-
-    newdf = df[row_inds, :]
-    disallowmissing!(newdf, col_disallowmissing_inds)
+        # With exact indices we can skip a lot of iterations (as opposed to indexing via a mask)
+        selected_rows = _findall(rowidxs)
+        new_columns = Vector{AbstractVector}(undef, ncol(df))
+        df_columns = if typeof(df) == DataFrame 
+            _columns(df)
+        else 
+            # for SubDataFrame
+            collect(eachcol(df))
+        end
 
-    return df
-end
+        # What column indices should disallowmissing be applied to
+        cols_inds = index(df)[cols]
 
-# new implementation; to be swapped with the above if design is okay
-@inline function _dropmissing(df::AbstractDataFrame,
-    cols::Union{ColumnIndex,MultiColumnIndex}=:;
-    view::Bool=false, disallowmissing::Bool=!view)
-    rowidxs = completecases(df, cols)
-    if view
-        if disallowmissing
-            throw(ArgumentError("disallowmissing=true is incompatible with view=true"))
-        end
-        return Base.view(df, rowidxs, :)
-    else
-        if disallowmissing
-            newdf = _getindex_disallowmissing(df, rowidxs, cols)
+        # Threading decision rule borrowed from `_threaded_getindex`
+        if Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 1_000_000
+            @sync for i in eachindex(new_columns)
+                # for each column, check if disallowmissing should be applied
+                Threads.@spawn if disallowmissing && (i in cols_inds)
+                    new_columns[i] = Missings.disallowmissing(@view df_columns[i][selected_rows])
+                else
+                    new_columns[i] = df_columns[i][selected_rows]
+                end
+            end
         else
-            newdf = df[rowidxs, :]
+            for i in eachindex(new_columns)
+                if disallowmissing && (i in cols_inds)
+                    new_columns[i] = Missings.disallowmissing(@view df_columns[i][selected_rows])
+                else
+                    new_columns[i] = df_columns[i][selected_rows]
+                end
+            end
         end
+        # Output will be DataFrame even if SubDataFrame is provided
+        # (assuming that view=false means user wants a DataFrame)
+        newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false)
+
+        # technically, disallowmissing! drops all nonnote metadata but we never copied it, so no need to drop
+        _copy_all_note_metadata!(newdf, df)
         return newdf
     end
 end
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
index ced4f0a092..01e05fc206 100755
--- a/src/dataframe/dataframe.jl
+++ b/src/dataframe/dataframe.jl
@@ -625,45 +625,6 @@ Base.getindex(df::DataFrame, row_ind::Colon, col_inds::MultiColumnIndex) =
 Base.getindex(df::DataFrame, row_ind::typeof(!), col_inds::MultiColumnIndex) =
     select(df, index(df)[col_inds], copycols=false)
 
-# Gets a subset of a DataFrame by `row_inds` and disallows missing values in columns specified by `col_disallowmissing_inds` 
-# Called by dropmissing()
-function _getindex_disallowmissing(df::DataFrame, row_inds::AbstractVector{T},
-    col_disallowmissing_inds::Union{ColumnIndex,MultiColumnIndex,Colon}) where {T<:Integer}
-
-    # With knowledge of exact indices we can skip a lot of loop iterations (as opposed to filtering by a mask)
-    selected_rows = T === Bool ? _findall(row_inds) : row_inds
-    new_columns = Vector{AbstractVector}(undef, ncol(df))
-    df_columns = DataFrames._columns(df)
-
-    # We need to know the exact indices of columns that disallowmissing should be applied to
-    disallowmissing_columns_inds = index(df)[col_disallowmissing_inds]
-
-    # Threading decision rule borrowed from `_threaded_getindex`
-    if Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 1_000_000
-        @sync for i in eachindex(new_columns)
-            # for each column, check if disallowmissing should be applied
-            Threads.@spawn if i in disallowmissing_columns_inds
-                @inbounds new_columns[i] = _getindex_disallowmissing(df_columns[i], selected_rows)
-            else
-                @inbounds new_columns[i] = df_columns[i][selected_rows]
-            end
-        end
-    else
-        @inbounds for i in eachindex(new_columns)
-            if i in disallowmissing_columns_inds
-                new_columns[i] = _getindex_disallowmissing(df_columns[i], selected_rows)
-            else
-                new_columns[i] = df_columns[i][selected_rows]
-            end
-        end
-    end
-
-    newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false)
-    _copy_all_note_metadata!(newdf, df)
-
-    return newdf
-end
-
 ##############################################################################
 ##
 ## setindex!()
diff --git a/src/other/utils.jl b/src/other/utils.jl
index 4c32bb1070..1f2deac792 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -476,12 +476,4 @@ function _findall(B::BitVector)::Union{UnitRange{Int}, Vector{Int}}
         end
     end
     @assert false "should not be reached"
-end
-
-# Creates a vector like `vect` without the missing type and copies only data at indices in `inds`
-# Fuses two operations that were previously done by getindex + disallowmissing
-# unsafe because it does not check for `inds` values to be not missing (ensured by the caller)
-# Called by `_getindex_disallowmissing(df,...)`
-function _getindex_disallowmissing(vect::AbstractVector{T}, inds::AbstractVector{<:Integer}) where {T}
-    return disallowmissing(@view vect[inds])
 end
\ No newline at end of file

From a0bf4409a9ecf96934cb595368d970f00b0b997c Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 07:31:38 +0000
Subject: [PATCH 10/40] updated comments

---
 src/abstractdataframe/abstractdataframe.jl | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 84b768136b..d54312d4b6 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -978,15 +978,11 @@ julia> dropmissing(df, [:x, :y])
         end
         return Base.view(df, rowidxs, :)
     else
-        # With exact indices we can skip a lot of iterations (as opposed to indexing via a mask)
+        # Faster when there are many columns (Indexing with Integers than via Bool mask)
+        # or when there are many Missing (as we skip a lot of iterations)
         selected_rows = _findall(rowidxs)
         new_columns = Vector{AbstractVector}(undef, ncol(df))
-        df_columns = if typeof(df) == DataFrame 
-            _columns(df)
-        else 
-            # for SubDataFrame
-            collect(eachcol(df))
-        end
+        df_columns = df isa DataFrame ? _columns(df) : collect(AbstractVector, eachcol(df))
 
         # What column indices should disallowmissing be applied to
         cols_inds = index(df)[cols]
@@ -1010,11 +1006,8 @@ julia> dropmissing(df, [:x, :y])
                 end
             end
         end
-        # Output will be DataFrame even if SubDataFrame is provided
-        # (assuming that view=false means user wants a DataFrame)
         newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false)
 
-        # technically, disallowmissing! drops all nonnote metadata but we never copied it, so no need to drop
         _copy_all_note_metadata!(newdf, df)
         return newdf
     end

From dcfa4e178dbf755a82894cbb52b8d9fcafdae234 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 07:34:36 +0000
Subject: [PATCH 11/40] update view format

---
 src/abstractdataframe/abstractdataframe.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index d54312d4b6..7f97d59762 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -992,7 +992,7 @@ julia> dropmissing(df, [:x, :y])
             @sync for i in eachindex(new_columns)
                 # for each column, check if disallowmissing should be applied
                 Threads.@spawn if disallowmissing && (i in cols_inds)
-                    new_columns[i] = Missings.disallowmissing(@view df_columns[i][selected_rows])
+                    new_columns[i] = Missings.disallowmissing(view(df_columns[i],selected_rows))
                 else
                     new_columns[i] = df_columns[i][selected_rows]
                 end
@@ -1000,7 +1000,7 @@ julia> dropmissing(df, [:x, :y])
         else
             for i in eachindex(new_columns)
                 if disallowmissing && (i in cols_inds)
-                    new_columns[i] = Missings.disallowmissing(@view df_columns[i][selected_rows])
+                    new_columns[i] = Missings.disallowmissing(view(df_columns[i],selected_rows))
                 else
                     new_columns[i] = df_columns[i][selected_rows]
                 end

From 9b4478f794eb770ec5a039f4fa376b286d2a61ca Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 07:53:01 +0000
Subject: [PATCH 12/40] Change back to @view, due to errors with `view()`

---
 src/abstractdataframe/abstractdataframe.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 7f97d59762..d4395c6cc1 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -992,7 +992,7 @@ julia> dropmissing(df, [:x, :y])
             @sync for i in eachindex(new_columns)
                 # for each column, check if disallowmissing should be applied
                 Threads.@spawn if disallowmissing && (i in cols_inds)
-                    new_columns[i] = Missings.disallowmissing(view(df_columns[i],selected_rows))
+                    new_columns[i] = Missings.disallowmissing(@view(df_columns[i][selected_rows]))
                 else
                     new_columns[i] = df_columns[i][selected_rows]
                 end
@@ -1000,7 +1000,7 @@ julia> dropmissing(df, [:x, :y])
         else
             for i in eachindex(new_columns)
                 if disallowmissing && (i in cols_inds)
-                    new_columns[i] = Missings.disallowmissing(view(df_columns[i],selected_rows))
+                    new_columns[i] = Missings.disallowmissing(@view(df_columns[i][selected_rows]))
                 else
                     new_columns[i] = df_columns[i][selected_rows]
                 end

From afef17349ae43845d9a60f9e5eb7def0fd76a00c Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 10:20:54 +0000
Subject: [PATCH 13/40] import view() explicitly

---
 src/abstractdataframe/abstractdataframe.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index d4395c6cc1..910f4acfc7 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -992,7 +992,8 @@ julia> dropmissing(df, [:x, :y])
             @sync for i in eachindex(new_columns)
                 # for each column, check if disallowmissing should be applied
                 Threads.@spawn if disallowmissing && (i in cols_inds)
-                    new_columns[i] = Missings.disallowmissing(@view(df_columns[i][selected_rows]))
+                    # implicit imports of functions due to name clash with keyword arguments!
+                    new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i],selected_rows))
                 else
                     new_columns[i] = df_columns[i][selected_rows]
                 end
@@ -1000,7 +1001,8 @@ julia> dropmissing(df, [:x, :y])
         else
             for i in eachindex(new_columns)
                 if disallowmissing && (i in cols_inds)
-                    new_columns[i] = Missings.disallowmissing(@view(df_columns[i][selected_rows]))
+                    # implicit imports of functions due to name clash with keyword arguments!
+                    new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i],selected_rows))
                 else
                     new_columns[i] = df_columns[i][selected_rows]
                 end

From fc643bbb34b58ec739756c6510ed72379f736338 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 10:32:03 +0000
Subject: [PATCH 14/40] add note on dropmissing parallelism

I did not mention the heuristic rule about number of rows to make future updates easier
---
 docs/src/lib/functions.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index 9b9de28471..95989e0c85 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -25,6 +25,8 @@ This is a list of operations that currently make use of multi-threading:
   * a transformation produces one row per group and the passed transformation
     is a custom function (i.e. not for standard reductions, which use
     optimized single-threaded methods).
+- `dropmissing` when the provided `DataFrame` has more than 1 column and `view=false` 
+  (subset-ing into a column without `Missing` is spawned in a separate task for each column).
 
 In general at least Julia 1.4 is required to ensure that multi-threading is used
 and the Julia process must be started with more than one thread. Some operations

From d8248a4fb2eaf5adf49d9b352ba73ba7ced69531 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 10:48:11 +0000
Subject: [PATCH 15/40] added test for zero-column and CategoricalArrays

---
 test/data.jl | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/test/data.jl b/test/data.jl
index 3399ad35e7..610bbaf9a5 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -168,6 +168,10 @@ end
         @test df1b == df1
     end
 
+    # Zero column case
+    isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame
+    @test_throws ArgumentError dropmissing(df1,[])
+    
     df = DataFrame(a=[1, missing, 3])
     sdf = view(df, :, :)
     @test dropmissing(sdf) == DataFrame(a=[1, 3])
@@ -200,6 +204,13 @@ end
     df = DataFrame(b=b)
     @test eltype(dropmissing(df).b) == Int
     @test eltype(dropmissing!(df).b) == Int
+
+    # CategoricalArrays
+    c = Union{Int64, Missing}[1,2,1,missing]|>categorical
+    df = DataFrame(c=c)
+    @test dropmissing(df) == DataFrame(c=categorical([1, 2, 1]))
+    @test eltype(dropmissing(df).c) == CategoricalValue{Int64, UInt32}
+    @test eltype(dropmissing!(df).c) == CategoricalValue{Int64, UInt32}
 end
 
 @testset "deleteat! https://github.com/JuliaLang/julia/pull/41646 bug workaround" begin

From b63ee3700ac238cac61b1a5ed041c3f55918e977 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 11:00:40 +0000
Subject: [PATCH 16/40] added test for view=true in dropmissing

---
 test/data.jl | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/data.jl b/test/data.jl
index 610bbaf9a5..c8327c33e4 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -190,6 +190,16 @@ end
     @test eltype(df2.a) == Union{Int, Missing}
     @test df.a == df2.a == [1, 3]
 
+    # view=true
+    df = DataFrame(a=[1, missing, 3])
+    @test dropmissing(df, view=false) == DataFrame(a=[1, 3])
+    @test dropmissing(df, view=true) == view(df, [1, 3], :)
+    @test typeof(dropmissing(df, view=true)) <: SubDataFrame
+    @test eltype(dropmissing(df, view=true, disallowmissing=false).a) == Union{Int, Missing}
+    @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true)
+    @test eltype(dropmissing(df, view=false, disallowmissing=false).a) == Union{Int, Missing}
+    @test eltype(dropmissing(df, view=false, disallowmissing=true).a) == Int
+
     a = [1, 2]
     df = DataFrame(a=a, copycols=false)
     @test dropmissing!(df) === df

From bd343a33887611704b4eaa489564baeee7074ccf Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 11:02:11 +0000
Subject: [PATCH 17/40] add case for dropmissing!

---
 test/data.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/data.jl b/test/data.jl
index c8327c33e4..22e982d036 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -170,7 +170,9 @@ end
 
     # Zero column case
     isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame
-    @test_throws ArgumentError dropmissing(df1,[])
+    isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame
+    @test_throws ArgumentError dropmissing(df1,[]) 
+    @test_throws ArgumentError dropmissing!(df1,[]) 
     
     df = DataFrame(a=[1, missing, 3])
     sdf = view(df, :, :)

From 16c278d7ce597d0be612ae4c63d291c82b3feb23 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 11:06:29 +0000
Subject: [PATCH 18/40] add disallowmissing=false test

---
 test/data.jl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/data.jl b/test/data.jl
index 22e982d036..e5bdaded82 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -217,6 +217,12 @@ end
     @test eltype(dropmissing(df).b) == Int
     @test eltype(dropmissing!(df).b) == Int
 
+    # disallowmissing=false
+    b = Union{Int, Missing}[1, 2]
+    df = DataFrame(b=b)
+    @test eltype(dropmissing(df, disallowmissing=false).b) == Union{Int, Missing}
+    @test eltype(dropmissing!(df, disallowmissing=false).b) == Union{Int, Missing}
+
     # CategoricalArrays
     c = Union{Int64, Missing}[1,2,1,missing]|>categorical
     df = DataFrame(c=c)

From eb998cdb957d9b1a6bf4aab607bd9a787d9a4267 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 11:12:38 +0000
Subject: [PATCH 19/40] removed duplicative test case

---
 test/data.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/data.jl b/test/data.jl
index e5bdaded82..d2c5f85e33 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -198,7 +198,7 @@ end
     @test dropmissing(df, view=true) == view(df, [1, 3], :)
     @test typeof(dropmissing(df, view=true)) <: SubDataFrame
     @test eltype(dropmissing(df, view=true, disallowmissing=false).a) == Union{Int, Missing}
-    @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true)
+    # @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true) # tested separately
     @test eltype(dropmissing(df, view=false, disallowmissing=false).a) == Union{Int, Missing}
     @test eltype(dropmissing(df, view=false, disallowmissing=true).a) == Int
 

From 20526dd39901f0016f3b56aee0a56862a4e91aa9 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 11:20:08 +0000
Subject: [PATCH 20/40] add test case that disallowmissing=true targets only
 the required column

---
 test/data.jl | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/test/data.jl b/test/data.jl
index d2c5f85e33..1efcfc5263 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -218,10 +218,17 @@ end
     @test eltype(dropmissing!(df).b) == Int
 
     # disallowmissing=false
+    a = Union{Int, Missing}[3, 4]
     b = Union{Int, Missing}[1, 2]
-    df = DataFrame(b=b)
-    @test eltype(dropmissing(df, disallowmissing=false).b) == Union{Int, Missing}
-    @test eltype(dropmissing!(df, disallowmissing=false).b) == Union{Int, Missing}
+    df = DataFrame(;a,b)
+    @test eltype(dropmissing(df, disallowmissing=false).a) == Union{Int, Missing}
+    @test eltype(dropmissing!(copy(df), disallowmissing=false).a) == Union{Int, Missing}
+    @test eltype(dropmissing(df, disallowmissing=true).a) == Int
+    @test eltype(dropmissing!(copy(df), disallowmissing=true).a) == Int
+    @test eltype(dropmissing(df, :a, disallowmissing=true).a) == Int
+    @test eltype(dropmissing!(copy(df), :a, disallowmissing=true).a) == Int
+    @test eltype(dropmissing(df, :b, disallowmissing=true).a) == Union{Int, Missing}
+    @test eltype(dropmissing!(copy(df), :b, disallowmissing=true).a) == Union{Int, Missing}
 
     # CategoricalArrays
     c = Union{Int64, Missing}[1,2,1,missing]|>categorical

From 13deb843a20d650818cc4e681940c97a4c1f8845 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 11:28:28 +0000
Subject: [PATCH 21/40] simplified the docstring

---
 docs/src/lib/functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index 95989e0c85..7ab48ea779 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -26,7 +26,7 @@ This is a list of operations that currently make use of multi-threading:
     is a custom function (i.e. not for standard reductions, which use
     optimized single-threaded methods).
 - `dropmissing` when the provided `DataFrame` has more than 1 column and `view=false` 
-  (subset-ing into a column without `Missing` is spawned in a separate task for each column).
+  (subset-ing of individual columns is spawned in separate tasks).
 
 In general at least Julia 1.4 is required to ensure that multi-threading is used
 and the Julia process must be started with more than one thread. Some operations

From fdc8e60970483ed43b171340bc64869e90ca3dbb Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 19:58:18 +0000
Subject: [PATCH 22/40] Update docs/src/lib/functions.md

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 docs/src/lib/functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index 7ab48ea779..5217df4835 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -26,7 +26,7 @@ This is a list of operations that currently make use of multi-threading:
     is a custom function (i.e. not for standard reductions, which use
     optimized single-threaded methods).
 - `dropmissing` when the provided `DataFrame` has more than 1 column and `view=false` 
-  (subset-ing of individual columns is spawned in separate tasks).
+  (subsetting of individual columns is spawned in separate tasks).
 
 In general at least Julia 1.4 is required to ensure that multi-threading is used
 and the Julia process must be started with more than one thread. Some operations

From 504e4cd93e23351c36888fad53c2f082ab068afe Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 19:59:31 +0000
Subject: [PATCH 23/40] Update test/data.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 test/data.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/data.jl b/test/data.jl
index 1efcfc5263..d38ab046ca 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -231,7 +231,7 @@ end
     @test eltype(dropmissing!(copy(df), :b, disallowmissing=true).a) == Union{Int, Missing}
 
     # CategoricalArrays
-    c = Union{Int64, Missing}[1,2,1,missing]|>categorical
+    c = categorical([1, 2, 1, missing])
     df = DataFrame(c=c)
     @test dropmissing(df) == DataFrame(c=categorical([1, 2, 1]))
     @test eltype(dropmissing(df).c) == CategoricalValue{Int64, UInt32}

From c004a912cdf6225e5b25241747b99e23cb2f949e Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 19:59:53 +0000
Subject: [PATCH 24/40] Update src/abstractdataframe/abstractdataframe.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/abstractdataframe.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 910f4acfc7..0a173592b0 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -992,7 +992,6 @@ julia> dropmissing(df, [:x, :y])
             @sync for i in eachindex(new_columns)
                 # for each column, check if disallowmissing should be applied
                 Threads.@spawn if disallowmissing && (i in cols_inds)
-                    # implicit imports of functions due to name clash with keyword arguments!
                     new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i],selected_rows))
                 else
                     new_columns[i] = df_columns[i][selected_rows]

From b895692ea02af4453a294adae929c7db3dbed758 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 20:00:11 +0000
Subject: [PATCH 25/40] Update src/abstractdataframe/abstractdataframe.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/abstractdataframe.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 0a173592b0..ab58faa884 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -992,7 +992,7 @@ julia> dropmissing(df, [:x, :y])
             @sync for i in eachindex(new_columns)
                 # for each column, check if disallowmissing should be applied
                 Threads.@spawn if disallowmissing && (i in cols_inds)
-                    new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i],selected_rows))
+                    new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i], selected_rows))
                 else
                     new_columns[i] = df_columns[i][selected_rows]
                 end

From ba2760aa1b5fc48289f536811b280c6770a4ca45 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 20:00:28 +0000
Subject: [PATCH 26/40] Update src/other/utils.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/other/utils.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/other/utils.jl b/src/other/utils.jl
index 1f2deac792..dedd6ba997 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -476,4 +476,4 @@ function _findall(B::BitVector)::Union{UnitRange{Int}, Vector{Int}}
         end
     end
     @assert false "should not be reached"
-end
\ No newline at end of file
+end

From d0f11879e4cc00536830f93ebfe6abf6c8d0f201 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 20:00:50 +0000
Subject: [PATCH 27/40] Update test/data.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 test/data.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/data.jl b/test/data.jl
index d38ab046ca..1c2325fe40 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -171,8 +171,8 @@ end
     # Zero column case
     isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame
     isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame
-    @test_throws ArgumentError dropmissing(df1,[]) 
-    @test_throws ArgumentError dropmissing!(df1,[]) 
+    @test_throws ArgumentError dropmissing(df1, []) 
+    @test_throws ArgumentError dropmissing!(df1, []) 
     
     df = DataFrame(a=[1, missing, 3])
     sdf = view(df, :, :)

From 8c5198545646e0ae83517b867d4666a572f063f2 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 20:02:47 +0000
Subject: [PATCH 28/40] Update test/data.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 test/data.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/data.jl b/test/data.jl
index 1c2325fe40..f9c0d97b03 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -217,7 +217,7 @@ end
     @test eltype(dropmissing(df).b) == Int
     @test eltype(dropmissing!(df).b) == Int
 
-    # disallowmissing=false
+    # disallowmissing argument
     a = Union{Int, Missing}[3, 4]
     b = Union{Int, Missing}[1, 2]
     df = DataFrame(;a,b)

From 267e5662c2b95a4c9f6623ccf1c8945006007b79 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Sun, 22 Jan 2023 20:03:13 +0000
Subject: [PATCH 29/40] Update test/data.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 test/data.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/data.jl b/test/data.jl
index f9c0d97b03..a37cc02325 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -234,8 +234,8 @@ end
     c = categorical([1, 2, 1, missing])
     df = DataFrame(c=c)
     @test dropmissing(df) == DataFrame(c=categorical([1, 2, 1]))
-    @test eltype(dropmissing(df).c) == CategoricalValue{Int64, UInt32}
-    @test eltype(dropmissing!(df).c) == CategoricalValue{Int64, UInt32}
+    @test eltype(dropmissing(df).c) == CategoricalValue{Int, UInt32}
+    @test eltype(dropmissing!(df).c) == CategoricalValue{Int, UInt32}
 end
 
 @testset "deleteat! https://github.com/JuliaLang/julia/pull/41646 bug workaround" begin

From 94b091aa3e7c5729aa091759bbfc35f280760d39 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Mon, 23 Jan 2023 07:07:24 +0000
Subject: [PATCH 30/40] use `@spawn_or_run` to avoid duplication

---
 src/abstractdataframe/abstractdataframe.jl | 27 +++++++---------------
 test/data.jl                               |  2 +-
 2 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index ab58faa884..262d7816ff 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -986,27 +986,16 @@ julia> dropmissing(df, [:x, :y])
 
         # What column indices should disallowmissing be applied to
         cols_inds = index(df)[cols]
-
-        # Threading decision rule borrowed from `_threaded_getindex`
-        if Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 1_000_000
-            @sync for i in eachindex(new_columns)
-                # for each column, check if disallowmissing should be applied
-                Threads.@spawn if disallowmissing && (i in cols_inds)
-                    new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i], selected_rows))
-                else
-                    new_columns[i] = df_columns[i][selected_rows]
-                end
-            end
-        else
-            for i in eachindex(new_columns)
-                if disallowmissing && (i in cols_inds)
-                    # implicit imports of functions due to name clash with keyword arguments!
-                    new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i],selected_rows))
-                else
-                    new_columns[i] = df_columns[i][selected_rows]
-                end
+        
+        use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000
+        @sync for i in eachindex(new_columns)
+            @spawn_or_run use_threads if disallowmissing && (i in cols_inds)
+                new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i], selected_rows))
+            else
+                new_columns[i] = df_columns[i][selected_rows]
             end
         end
+
         newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false)
 
         _copy_all_note_metadata!(newdf, df)
diff --git a/test/data.jl b/test/data.jl
index a37cc02325..c4c0d46a5d 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -198,7 +198,7 @@ end
     @test dropmissing(df, view=true) == view(df, [1, 3], :)
     @test typeof(dropmissing(df, view=true)) <: SubDataFrame
     @test eltype(dropmissing(df, view=true, disallowmissing=false).a) == Union{Int, Missing}
-    # @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true) # tested separately
+    @test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true)
     @test eltype(dropmissing(df, view=false, disallowmissing=false).a) == Union{Int, Missing}
     @test eltype(dropmissing(df, view=false, disallowmissing=true).a) == Int
 

From fa288f6d933fcbfc3b59508f73b2f6c15588e184 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Mon, 23 Jan 2023 07:37:43 +0000
Subject: [PATCH 31/40] add test for multithreaded path

---
 test/data.jl | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/test/data.jl b/test/data.jl
index c4c0d46a5d..d289f28584 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -236,6 +236,26 @@ end
     @test dropmissing(df) == DataFrame(c=categorical([1, 2, 1]))
     @test eltype(dropmissing(df).c) == CategoricalValue{Int, UInt32}
     @test eltype(dropmissing!(df).c) == CategoricalValue{Int, UInt32}
+
+    # Multithreaded execution test (must be at least ncol > 1, nrow > 100_000)
+    N_rows, N_cols = 10^7, 4
+    df = DataFrame([rand(N_rows) for i in 1:N_cols], :auto) |> allowmissing
+    # Deterministic drop mask: IF remainder of index position divided by 10 == column index THEN missing
+    for i in 1:ncol(df)
+        missing_mask = (eachindex(df[!,i]) .% 10) .== i
+        df[missing_mask,i] .= missing 
+    end
+    
+    notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > ncol(df)]
+    @test isequal(dropmissing(df), df[notmissing_rows, :])
+    
+    cols = [:x1, :x2]
+    notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > length(cols)]
+    returned = dropmissing(df, cols)
+    @test isequal(returned, df[notmissing_rows, :])
+    @test eltype(returned[:, cols[1]]) == nonmissingtype(eltype(df[:, cols[1]]))
+    @test eltype(returned[:, cols[2]]) == nonmissingtype(eltype(df[:, cols[2]]))
+    @test eltype(returned[:, ncol(df)]) == eltype(df[:, ncol(df)])
 end
 
 @testset "deleteat! https://github.com/JuliaLang/julia/pull/41646 bug workaround" begin

From 5172e72fe7c6e2345d245d8c125c0bb35f2a63b9 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Mon, 23 Jan 2023 13:11:37 +0000
Subject: [PATCH 32/40] Reduced test case memory requirement

---
 test/data.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/data.jl b/test/data.jl
index d289f28584..be05e08785 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -238,7 +238,7 @@ end
     @test eltype(dropmissing!(df).c) == CategoricalValue{Int, UInt32}
 
     # Multithreaded execution test (must be at least ncol > 1, nrow > 100_000)
-    N_rows, N_cols = 10^7, 4
+    N_rows, N_cols = 200_000, 3
     df = DataFrame([rand(N_rows) for i in 1:N_cols], :auto) |> allowmissing
     # Deterministic drop mask: IF remainder of index position divided by 10 == column index THEN missing
     for i in 1:ncol(df)

From 301bdea4cd2eeb9590a3ab0c91a450c8c438b317 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 24 Jan 2023 16:47:45 +0100
Subject: [PATCH 33/40] Apply suggestions from code review

---
 docs/src/lib/functions.md                  | 2 +-
 src/abstractdataframe/abstractdataframe.jl | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index 5217df4835..625cf60665 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -25,7 +25,7 @@ This is a list of operations that currently make use of multi-threading:
   * a transformation produces one row per group and the passed transformation
     is a custom function (i.e. not for standard reductions, which use
     optimized single-threaded methods).
-- `dropmissing` when the provided `DataFrame` has more than 1 column and `view=false` 
+- `dropmissing` when the provided data frame has more than 1 column and `view=false` 
   (subsetting of individual columns is spawned in separate tasks).
 
 In general at least Julia 1.4 is required to ensure that multi-threading is used
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 262d7816ff..66f8b4d42c 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -978,7 +978,7 @@ julia> dropmissing(df, [:x, :y])
         end
         return Base.view(df, rowidxs, :)
     else
-        # Faster when there are many columns (Indexing with Integers than via Bool mask)
+        # Faster when there are many columns (indexing with integers than via Bool mask)
         # or when there are many Missing (as we skip a lot of iterations)
         selected_rows = _findall(rowidxs)
         new_columns = Vector{AbstractVector}(undef, ncol(df))
@@ -996,7 +996,7 @@ julia> dropmissing(df, [:x, :y])
             end
         end
 
-        newdf = DataFrame(new_columns, copy(DataFrames.index(df)), copycols=false)
+        newdf = DataFrame(new_columns, copy(index(df)), copycols=false)
 
         _copy_all_note_metadata!(newdf, df)
         return newdf

From c6477bf893508833625eb3f938a4ef870dbbf8c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 24 Jan 2023 16:49:58 +0100
Subject: [PATCH 34/40] Update src/abstractdataframe/abstractdataframe.jl

---
 src/abstractdataframe/abstractdataframe.jl | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 66f8b4d42c..e0c40f751d 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -981,18 +981,17 @@ julia> dropmissing(df, [:x, :y])
         # Faster when there are many columns (indexing with integers than via Bool mask)
         # or when there are many Missing (as we skip a lot of iterations)
         selected_rows = _findall(rowidxs)
-        new_columns = Vector{AbstractVector}(undef, ncol(df))
-        df_columns = df isa DataFrame ? _columns(df) : collect(AbstractVector, eachcol(df))
+        new_columns = df isa DataFrame ? copy(_columns(df)) : collect(AbstractVector, eachcol(df))
 
         # What column indices should disallowmissing be applied to
         cols_inds = index(df)[cols]
         
         use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000
-        @sync for i in eachindex(new_columns)
+        @sync for (i, col) in enumerate(new_columns)
             @spawn_or_run use_threads if disallowmissing && (i in cols_inds)
-                new_columns[i] = Missings.disallowmissing(Base.view(df_columns[i], selected_rows))
+                new_columns[i] = Missings.disallowmissing(Base.view(col, selected_rows))
             else
-                new_columns[i] = df_columns[i][selected_rows]
+                new_columns[i] = col[selected_rows]
             end
         end
 

From 3d82b90fbfd8562bfd40b6e198e411d44be52fb8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 24 Jan 2023 16:52:43 +0100
Subject: [PATCH 35/40] Apply suggestions from code review

---
 src/abstractdataframe/abstractdataframe.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index e0c40f751d..b1d932e0a3 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -981,13 +981,13 @@ julia> dropmissing(df, [:x, :y])
         # Faster when there are many columns (indexing with integers than via Bool mask)
         # or when there are many Missing (as we skip a lot of iterations)
         selected_rows = _findall(rowidxs)
-        new_columns = df isa DataFrame ? copy(_columns(df)) : collect(AbstractVector, eachcol(df))
+        new_columns = Vector{AbstractVector}(undef, ncol(df))
 
         # What column indices should disallowmissing be applied to
         cols_inds = index(df)[cols]
         
         use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000
-        @sync for (i, col) in enumerate(new_columns)
+        @sync for (i, col) in enumerate(eachcol(df))
             @spawn_or_run use_threads if disallowmissing && (i in cols_inds)
                 new_columns[i] = Missings.disallowmissing(Base.view(col, selected_rows))
             else

From f26c99a36e0dbcaf1ada9b390dbd0f00ed1e08ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 24 Jan 2023 17:03:20 +0100
Subject: [PATCH 36/40] Apply suggestions from code review

---
 test/data.jl | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/test/data.jl b/test/data.jl
index be05e08785..dd8fd94190 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -169,8 +169,12 @@ end
     end
 
     # Zero column case
-    isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame
-    isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame
+    @test isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame
+    @test isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame
+    df = DataFrame(a=1:3, b=4:6)
+    dfv = @view df[:, 2:1]
+    @test isempty(dropmissing(dfv) && dropmissing(dfv) isa DataFrame
+    @test_throws ArgumentError dropmissing!(DataFrame())
     @test_throws ArgumentError dropmissing(df1, []) 
     @test_throws ArgumentError dropmissing!(df1, []) 
     
@@ -238,12 +242,12 @@ end
     @test eltype(dropmissing!(df).c) == CategoricalValue{Int, UInt32}
 
     # Multithreaded execution test (must be at least ncol > 1, nrow > 100_000)
-    N_rows, N_cols = 200_000, 3
+    N_rows, N_cols = 110_000, 3
     df = DataFrame([rand(N_rows) for i in 1:N_cols], :auto) |> allowmissing
     # Deterministic drop mask: IF remainder of index position divided by 10 == column index THEN missing
     for i in 1:ncol(df)
-        missing_mask = (eachindex(df[!,i]) .% 10) .== i
-        df[missing_mask,i] .= missing 
+        missing_mask = (eachindex(df[!, i]) .% 10) .== i
+        df[missing_mask, i] .= missing 
     end
     
     notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > ncol(df)]

From fffda53be958608cc8881bdb5484bf1fc76dd645 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 24 Jan 2023 19:17:59 +0100
Subject: [PATCH 37/40] Update test/data.jl

---
 test/data.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/data.jl b/test/data.jl
index dd8fd94190..33d7d082bb 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -173,7 +173,8 @@ end
     @test isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame
     df = DataFrame(a=1:3, b=4:6)
     dfv = @view df[:, 2:1]
-    @test isempty(dropmissing(dfv) && dropmissing(dfv) isa DataFrame
+    # TODO: re-enable after https://github.com/JuliaData/DataFrames.jl/issues/3272 is resolved
+    # @test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame
     @test_throws ArgumentError dropmissing!(DataFrame())
     @test_throws ArgumentError dropmissing(df1, []) 
     @test_throws ArgumentError dropmissing!(df1, []) 

From 82f193cb6a54e0b5f558426653038c310ae0a71d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 24 Jan 2023 21:46:51 +0100
Subject: [PATCH 38/40] Update test/data.jl

---
 test/data.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/data.jl b/test/data.jl
index 33d7d082bb..92f9deb7d1 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -175,7 +175,7 @@ end
     dfv = @view df[:, 2:1]
     # TODO: re-enable after https://github.com/JuliaData/DataFrames.jl/issues/3272 is resolved
     # @test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame
-    @test_throws ArgumentError dropmissing!(DataFrame())
+    @test_throws ArgumentError dropmissing!(dfv)
     @test_throws ArgumentError dropmissing(df1, []) 
     @test_throws ArgumentError dropmissing!(df1, []) 
     

From 1426e3ac0ae98520422bfa3522e786fb6d43c59b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Thu, 26 Jan 2023 16:40:57 +0100
Subject: [PATCH 39/40] Apply suggestions from code review

---
 src/abstractdataframe/abstractdataframe.jl | 4 ++--
 test/data.jl                               | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index b1d932e0a3..1891b8262a 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -979,12 +979,12 @@ julia> dropmissing(df, [:x, :y])
         return Base.view(df, rowidxs, :)
     else
         # Faster when there are many columns (indexing with integers than via Bool mask)
-        # or when there are many Missing (as we skip a lot of iterations)
+        # or when there are few Missing (as we skip a lot of iterations)
         selected_rows = _findall(rowidxs)
         new_columns = Vector{AbstractVector}(undef, ncol(df))
 
         # What column indices should disallowmissing be applied to
-        cols_inds = index(df)[cols]
+        cols_inds = BitSet(index(df)[cols])
         
         use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000
         @sync for (i, col) in enumerate(eachcol(df))
diff --git a/test/data.jl b/test/data.jl
index 92f9deb7d1..02d953dd59 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -252,12 +252,12 @@ end
     end
     
     notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > ncol(df)]
-    @test isequal(dropmissing(df), df[notmissing_rows, :])
+    @test dropmissing(df) ≅ df[notmissing_rows, :]
     
     cols = [:x1, :x2]
     notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > length(cols)]
     returned = dropmissing(df, cols)
-    @test isequal(returned, df[notmissing_rows, :])
+    @test returned ≅ df[notmissing_rows, :]
     @test eltype(returned[:, cols[1]]) == nonmissingtype(eltype(df[:, cols[1]]))
     @test eltype(returned[:, cols[2]]) == nonmissingtype(eltype(df[:, cols[2]]))
     @test eltype(returned[:, ncol(df)]) == eltype(df[:, ncol(df)])

From 39580a23725c05fb293444a88c065a254d4ee3dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 27 Jan 2023 10:13:22 +0100
Subject: [PATCH 40/40] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/abstractdataframe.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 1891b8262a..4d7db3c5bc 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -970,7 +970,7 @@ julia> dropmissing(df, [:x, :y])
 @inline function dropmissing(df::AbstractDataFrame,
                              cols::Union{ColumnIndex, MultiColumnIndex}=:;
                              view::Bool=false, disallowmissing::Bool=!view)
-    # Identify Bool mask of which rows have no Missings
+    # Identify Bool mask of which rows have no missings
     rowidxs = completecases(df, cols)
     if view
         if disallowmissing
@@ -979,7 +979,7 @@ julia> dropmissing(df, [:x, :y])
         return Base.view(df, rowidxs, :)
     else
         # Faster when there are many columns (indexing with integers than via Bool mask)
-        # or when there are few Missing (as we skip a lot of iterations)
+        # or when there are many missings (as we skip a lot of iterations)
         selected_rows = _findall(rowidxs)
         new_columns = Vector{AbstractVector}(undef, ncol(df))