scikit-learn-contrib · glemaitre · Jan 16, 2022 · Sep 2, 2021 · Sep 2, 2021 · Sep 9, 2021
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
@@ -6,6 +6,7 @@ set -x
 UNAMESTR=`uname`
 
 make_conda() {
+    conda update -yq conda
     TO_INSTALL="$@"
     if [[ "$DISTRIB" == *"mamba"* ]]; then
         mamba create -n $VIRTUALENV --yes $TO_INSTALL

diff --git a/doc/developers_utils.rst b/doc/developers_utils.rst
@@ -29,7 +29,8 @@ which accepts arrays, matrices, or sparse matrices as arguments, the following
 should be used when applicable.
 
 - :func:`check_neighbors_object`: Check the objects is consistent to be a NN.
-- :func:`check_target_type`: Check the target types to be conform to the current sam  plers.
+- :func:`check_target_type`: Check that the target types conform to the
+  current samplers.
 - :func:`check_sampling_strategy`: Checks that sampling target is onsistent with
   the type and return a dictionary containing each targeted class with its
   corresponding number of pixel.

diff --git a/doc/whats_new/v0.10.rst b/doc/whats_new/v0.10.rst
@@ -5,3 +5,11 @@ Version 0.10.0 (ongoing)
 
 Changelog
 ---------
+
+Enhancements
+............
+
+- Add support for accepting compatible `NearestNeighbors` objects by relying
+  only on duck-typing. For instance, cuML instances are now accepted.
+  :pr:`858` by :user:`NV-jpt <NV-jpt>` and
+  :user:`Guillaume Lemaitre <glemaitre>`.
diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py
@@ -39,10 +39,17 @@ class ADASYN(BaseOverSampler):
     {random_state}
 
     n_neighbors : int or estimator object, default=5
-        If ``int``, number of nearest neighbours to used to construct synthetic
-        samples.  If object, an estimator that inherits from
-        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
-        find the k_neighbors.
+        The nearest neighbors used to define the neighborhood of samples to use
+        to generate the synthetic samples. You can pass:
+
+        - an `int` corresponding to the number of neighbors to use. A
+          :class:`~sklearn.neighbors.NearestNeighbors` instance will be fitted in this
+          case.
+        - an instance of a compatible nearest neighbors algorithm that should
+          implement both methods `kneighbors` and `kneighbors_graph`. For
+          instance, it could correspond to a
+          :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to
+          any compatible class.
 
     {n_jobs}
 
@@ -124,7 +131,6 @@ def _validate_estimator(self):
         self.nn_ = check_neighbors_object(
             "n_neighbors", self.n_neighbors, additional_neighbor=1
         )
-        self.nn_.set_params(**{"n_jobs": self.n_jobs})
 
     def _fit_resample(self, X, y):
         self._validate_estimator()

diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
@@ -224,10 +224,17 @@ class SMOTE(BaseSMOTE):
     {random_state}
 
     k_neighbors : int or object, default=5
-        If ``int``, number of nearest neighbours to used to construct synthetic
-        samples.  If object, an estimator that inherits from
-        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
-        find the k_neighbors.
+        The nearest neighbors used to define the neighborhood of samples to use
+        to generate the synthetic samples. You can pass:
+
+        - an `int` corresponding to the number of neighbors to use. A
+          :class:`~sklearn.neighbors.NearestNeighbors` instance will be fitted in this
+          case.
+        - an instance of a compatible nearest neighbors algorithm that should
+          implement both methods `kneighbors` and `kneighbors_graph`. For
+          instance, it could correspond to a
+          :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to
+          any compatible class.
 
     {n_jobs}
 
@@ -367,10 +374,17 @@ class SMOTENC(SMOTE):
     {random_state}
 
     k_neighbors : int or object, default=5
-        If ``int``, number of nearest neighbours to used to construct synthetic
-        samples.  If object, an estimator that inherits from
-        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
-        find the k_neighbors.
+        The nearest neighbors used to define the neighborhood of samples to use
+        to generate the synthetic samples. You can pass:
+
+        - an `int` corresponding to the number of neighbors to use. A
+          :class:`~sklearn.neighbors.NearestNeighbors` instance will be fitted in this
+          case.
+        - an instance of a compatible nearest neighbors algorithm that should
+          implement both methods `kneighbors` and `kneighbors_graph`. For
+          instance, it could correspond to a
+          :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to
+          any compatible class.
 
     {n_jobs}
 
@@ -636,10 +650,17 @@ class SMOTEN(SMOTE):
     {random_state}
 
     k_neighbors : int or object, default=5
-        If ``int``, number of nearest neighbours to used to construct synthetic
-        samples.  If object, an estimator that inherits from
-        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
-        find the k_neighbors.
+        The nearest neighbors used to define the neighborhood of samples to use
+        to generate the synthetic samples. You can pass:
+
+        - an `int` corresponding to the number of neighbors to use. A
+          :class:`~sklearn.neighbors.NearestNeighbors` instance will be fitted in this
+          case.
+        - an instance of a compatible nearest neighbors algorithm that should
+          implement both methods `kneighbors` and `kneighbors_graph`. For
+          instance, it could correspond to a
+          :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to
+          any compatible class.
 
     {n_jobs}
 

diff --git a/imblearn/over_sampling/_smote/cluster.py b/imblearn/over_sampling/_smote/cluster.py
@@ -45,10 +45,17 @@ class KMeansSMOTE(BaseSMOTE):
     {random_state}
 
     k_neighbors : int or object, default=2
-        If ``int``, number of nearest neighbours to used to construct synthetic
-        samples.  If object, an estimator that inherits from
-        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
-        find the k_neighbors.
+        The nearest neighbors used to define the neighborhood of samples to use
+        to generate the synthetic samples. You can pass:
+
+        - an `int` corresponding to the number of neighbors to use. A
+          :class:`~sklearn.neighbors.NearestNeighbors` instance will be fitted in this
+          case.
+        - an instance of a compatible nearest neighbors algorithm that should
+          implement both methods `kneighbors` and `kneighbors_graph`. For
+          instance, it could correspond to a
+          :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to
+          any compatible class.
 
     {n_jobs}
 

diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py
@@ -15,7 +15,6 @@
 from sklearn.utils import _safe_indexing
 
 from ..base import BaseOverSampler
-from ...exceptions import raise_isinstance_error
 from ...utils import check_neighbors_object
 from ...utils import Substitution
 from ...utils._docstring import _n_jobs_docstring
@@ -48,18 +47,32 @@ class BorderlineSMOTE(BaseSMOTE):
     {random_state}
 
     k_neighbors : int or object, default=5
-        If ``int``, number of nearest neighbours to used to construct synthetic
-        samples.  If object, an estimator that inherits from
-        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
-        find the k_neighbors.
+        The nearest neighbors used to define the neighborhood of samples to use
+        to generate the synthetic samples. You can pass:
+
+        - an `int` corresponding to the number of neighbors to use. A
+          :class:`~sklearn.neighbors.NearestNeighbors` instance will be fitted in this
+          case.
+        - an instance of a compatible nearest neighbors algorithm that should
+          implement both methods `kneighbors` and `kneighbors_graph`. For
+          instance, it could correspond to a
+          :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to
+          any compatible class.
 
     {n_jobs}
 
     m_neighbors : int or object, default=10
-        If int, number of nearest neighbours to use to determine if a minority
-        sample is in danger. If object, an estimator that inherits
-        from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used
-        to find the m_neighbors.
+        The nearest neighbors used to determine if a minority sample is in
+        "danger". You can pass:
+
+        - an `int` corresponding to the number of neighbors to use. A
+          :class:`~sklearn.neighbors.NearestNeighbors` instance will be fitted in this
+          case.
+        - an instance of a compatible nearest neighbors algorithm that should
+          implement both methods `kneighbors` and `kneighbors_graph`. For
+          instance, it could correspond to a
+          :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to
+          any compatible class.
 
     kind : {{"borderline-1", "borderline-2"}}, default='borderline-1'
         The type of SMOTE algorithm to use one of the following options:
@@ -155,7 +168,6 @@ def _validate_estimator(self):
         self.nn_m_ = check_neighbors_object(
             "m_neighbors", self.m_neighbors, additional_neighbor=1
         )
-        self.nn_m_.set_params(**{"n_jobs": self.n_jobs})
         if self.kind not in ("borderline-1", "borderline-2"):
             raise ValueError(
                 f'The possible "kind" of algorithm are '
@@ -263,21 +275,37 @@ class SVMSMOTE(BaseSMOTE):
     {random_state}
 
     k_neighbors : int or object, default=5
-        If ``int``, number of nearest neighbours to used to construct synthetic
-        samples.  If object, an estimator that inherits from
-        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
-        find the k_neighbors.
+        The nearest neighbors used to define the neighborhood of samples to use
+        to generate the synthetic samples. You can pass:
+
+        - an `int` corresponding to the number of neighbors to use. A
+          :class:`~sklearn.neighbors.NearestNeighbors` instance will be fitted in this
+          case.
+        - an instance of a compatible nearest neighbors algorithm that should
+          implement both methods `kneighbors` and `kneighbors_graph`. For
+          instance, it could correspond to a
+          :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to
+          any compatible class.
 
     {n_jobs}
 
     m_neighbors : int or object, default=10
-        If int, number of nearest neighbours to use to determine if a minority
-        sample is in danger. If object, an estimator that inherits from
-        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
-        find the m_neighbors.
+        The nearest neighbors used to determine if a minority sample is in
+        "danger". You can pass:
+
+        - an `int` corresponding to the number of neighbors to use. A
+          :class:`~sklearn.neighbors.NearestNeighbors` instance will be fitted in this
+          case.
+        - an instance of a compatible nearest neighbors algorithm that should
+          implement both methods `kneighbors` and `kneighbors_graph`. For
+          instance, it could correspond to a
+          :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to
+          any compatible class.
 
     svm_estimator : estimator object, default=SVC()
         A parametrized :class:`~sklearn.svm.SVC` classifier can be passed.
+        A scikit-learn compatible estimator can be passed but it is required
+        to expose a `support_` fitted attribute.
 
     out_step : float, default=0.5
         Step size when extrapolating.
@@ -381,14 +409,11 @@ def _validate_estimator(self):
         self.nn_m_ = check_neighbors_object(
             "m_neighbors", self.m_neighbors, additional_neighbor=1
         )
-        self.nn_m_.set_params(**{"n_jobs": self.n_jobs})
 
         if self.svm_estimator is None:
             self.svm_estimator_ = SVC(gamma="scale", random_state=self.random_state)
-        elif isinstance(self.svm_estimator, SVC):
-            self.svm_estimator_ = clone(self.svm_estimator)
         else:
-            raise_isinstance_error("svm_estimator", [SVC], self.svm_estimator)
+            self.svm_estimator_ = clone(self.svm_estimator)
 
     def _fit_resample(self, X, y):
         self._validate_estimator()
@@ -403,6 +428,12 @@ def _fit_resample(self, X, y):
             X_class = _safe_indexing(X, target_class_indices)
 
             self.svm_estimator_.fit(X, y)
+            if not hasattr(self.svm_estimator_, "support_"):
+                raise RuntimeError(
+                    "`svm_estimator` is required to exposed a `support_` fitted "
+                    "attribute. Such estimator belongs to the familly of Support "
+                    "Vector Machine."
+                )
             support_index = self.svm_estimator_.support_[
                 y[self.svm_estimator_.support_] == class_sample
             ]

diff --git a/imblearn/over_sampling/_smote/tests/test_smote.py b/imblearn/over_sampling/_smote/tests/test_smote.py
@@ -4,15 +4,12 @@
 # License: MIT
 
 import numpy as np
-import pytest
 
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_array_equal
 from sklearn.neighbors import NearestNeighbors
 
 from imblearn.over_sampling import SMOTE
-from imblearn.over_sampling import SVMSMOTE
-from imblearn.over_sampling import BorderlineSMOTE
 
 
 RND_SEED = 0
@@ -153,14 +150,3 @@ def test_sample_regular_with_nn():
     )
     assert_allclose(X_resampled, X_gt, rtol=R_TOL)
     assert_array_equal(y_resampled, y_gt)
-
-
-@pytest.mark.parametrize(
-    "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"]
-)
-def test_smote_m_neighbors(smote):
-    # check that m_neighbors is properly set. Regression test for:
-    # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568
-    _ = smote.fit_resample(X, Y)
-    assert smote.nn_k_.n_neighbors == 6
-    assert smote.nn_m_.n_neighbors == 11
diff --git a/imblearn/over_sampling/_smote/tests/test_svm_smote.py b/imblearn/over_sampling/_smote/tests/test_svm_smote.py
@@ -1,6 +1,7 @@
 import pytest
 import numpy as np
 
+from sklearn.linear_model import LogisticRegression
 from sklearn.neighbors import NearestNeighbors
 from sklearn.svm import SVC
 
@@ -54,3 +55,12 @@ def test_svm_smote(data):
 
     assert_allclose(X_res_1, X_res_2)
     assert_array_equal(y_res_1, y_res_2)
+
+
+def test_svm_smote_not_svm(data):
+    """Check that we raise a proper error if passing an estimator that does not
+    expose a `support_` fitted attribute."""
+
+    err_msg = "`svm_estimator` is required to exposed a `support_` fitted attribute."
+    with pytest.raises(RuntimeError, match=err_msg):
+        SVMSMOTE(svm_estimator=LogisticRegression()).fit_resample(*data)
diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py
@@ -131,7 +131,11 @@ def test_ada_fit_resample_nn_obj():
             {"sampling_strategy": {0: 9, 1: 12}},
             "No samples will be generated.",
         ),
-        ({"n_neighbors": "rnd"}, "has to be one of"),
+        (
+            {"n_neighbors": "rnd"},
+            "n_neighbors must be an interger or an object compatible with the "
+            "KNeighborsMixin API of scikit-learn",
+        ),
     ],
 )
 def test_adasyn_error(adasyn_params, err_msg):