Bugfix: sklearn 1.1 and SliceDataset #858

Merged · 6 commits · May 24, 2022
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -14,8 +14,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed
- Initialize data loaders for training and validation dataset once per fit call instead of once per epoch ([migration guide](https://skorch.readthedocs.io/en/stable/user/FAQ.html#migration-from-0-11-to-0-12))
- It is now possible to call `np.asarray` with `SliceDataset`s (#858)

### Fixed
- Fix a bug in `SliceDataset` that prevented it from being used with `to_numpy` (#858)

## [0.11.0] - 2021-10-11

9 changes: 9 additions & 0 deletions skorch/helper.py
@@ -14,6 +14,7 @@
from skorch.cli import parse_args # pylint: disable=unused-import
from skorch.dataset import unpack_data
from skorch.utils import _make_split
from skorch.utils import to_numpy
from skorch.utils import is_torch_data_type
from skorch.utils import to_tensor

@@ -246,6 +247,14 @@ def __getitem__(self, i):

return SliceDataset(self.dataset, idx=self.idx, indices=self.indices_[i])

def __array__(self, dtype=None):
# This method is invoked when calling np.asarray(X)
# https://numpy.org/devdocs/user/basics.dispatch.html
X = [self[i] for i in range(len(self))]
if np.isscalar(X[0]):
return np.asarray(X)
return np.asarray([to_numpy(x) for x in X], dtype=dtype)


def predefined_split(dataset):
"""Uses ``dataset`` for validiation in :class:`.NeuralNet`.
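The new `__array__` hook is what NumPy's dispatch protocol looks for when `np.asarray` is called on an object. A minimal usage sketch (`Dataset` and `SliceDataset` are skorch's public classes; the data here is made up for illustration, not taken from the PR's tests):

```python
import numpy as np
import torch
from skorch.dataset import Dataset
from skorch.helper import SliceDataset

# illustrative data
X = torch.rand(100, 20)
y = torch.randint(0, 2, (100,))
ds = Dataset(X, y)

slds = SliceDataset(ds, idx=0)  # array-like view over X
arr = np.asarray(slds)          # dispatches to SliceDataset.__array__
assert arr.shape == (100, 20) and arr.dtype == np.float32
```

Note the fast path in the implementation: if the first element is a plain scalar, the collected list is converted directly; otherwise each element is run through `to_numpy` first, so torch tensors are handled as well.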
18 changes: 10 additions & 8 deletions skorch/tests/callbacks/test_all.py
@@ -46,12 +46,14 @@ def test_on_x_methods_have_kwargs(self, callbacks, on_x_methods):
def test_set_params_with_unknown_key_raises(self, base_cls):
with pytest.raises(ValueError) as exc:
base_cls().set_params(foo=123)

msg_start = (
"Invalid parameter foo for estimator <skorch.callbacks.base.Callback")
msg_end = (
"Check the list of available parameters with "
"`estimator.get_params().keys()`.")
msg = exc.value.args[0]
assert msg.startswith(msg_start)
assert msg.endswith(msg_end)

# message contains "'" around variable name starting from sklearn 1.1
assert (
msg.startswith(
"Invalid parameter foo for estimator <skorch.callbacks.base.Callback"
)
or msg.startswith(
"Invalid parameter 'foo' for estimator <skorch.callbacks.base.Callback"
)
)
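
The duplicated `startswith` branches accommodate sklearn 1.1 putting quotes around the parameter name. A terser, version-agnostic alternative would be a single regex with an optional quote — a sketch, not what the PR uses:

```python
import re

# msg would come from exc.value.args[0] in the test above
msg = "Invalid parameter 'foo' for estimator <skorch.callbacks.base.Callback..."
assert re.match(
    r"Invalid parameter '?foo'? for estimator <skorch\.callbacks\.base\.Callback",
    msg,
)
```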
69 changes: 61 additions & 8 deletions skorch/tests/test_helper.py
@@ -269,10 +269,13 @@ def test_equals_different_keys(self, sldict_cls):


class TestSliceDataset:
@pytest.fixture(scope='class')
def data(self):
@pytest.fixture(scope='class', params=['numpy', 'torch'])
def data(self, request):
X, y = make_classification(100, 20, n_informative=10, random_state=0)
return X.astype(np.float32), y
X = X.astype(np.float32)
if request.param == 'numpy':
return X, y
return torch.from_numpy(X), torch.from_numpy(y)

@pytest.fixture
def X(self, data):
@@ -331,6 +334,14 @@ def test_len_and_shape(self, slds, y):
np.array([0, 0, 1, 0] * 25, dtype=np.bool),
])
def test_len_and_shape_sliced(self, slds, y, sl):
# torch tensors don't support negative steps, skip test
if (
isinstance(sl, slice)
and (sl == slice(None, None, -1))
and isinstance(y, torch.Tensor)
):
return

assert len(slds[sl]) == len(y[sl])
assert slds[sl].shape == (len(y[sl]),)

@@ -383,12 +394,18 @@ def test_slice_three_times(self, slds_cls, custom_ds, X, y, sl0, sl1, sl2, n):
assert np.allclose(sliced, x)

def test_explicitly_pass_indices_at_init(self, slds_cls, custom_ds, X):
from skorch.utils import to_numpy
# test passing indices directly to __init__
slds = slds_cls(custom_ds, indices=np.arange(10))
sliced0 = slds[5:]
assert np.allclose(sliced0, X[5:10])

sliced1 = sliced0[2]

# comparison method depends on array type
if isinstance(sliced1, torch.Tensor):
sliced0 = to_numpy(sliced0)
sliced1 = to_numpy(sliced1)

assert np.allclose(sliced0, X[5:10])
assert np.allclose(sliced1, X[7])

def test_access_element_out_of_bounds(self, slds_cls, custom_ds):
@@ -425,7 +442,9 @@ def test_grid_search_with_slds_works(
'lr': [0.01, 0.02],
'max_epochs': [10, 20],
}
gs = GridSearchCV(net, params, refit=False, cv=3, scoring='accuracy')
gs = GridSearchCV(
net, params, refit=False, cv=3, scoring='accuracy', error_score='raise'
)
gs.fit(slds, y) # does not raise

def test_grid_search_with_slds_and_internal_split_works(
@@ -438,7 +457,9 @@ def test_grid_search_with_slds_and_internal_split_works(
'lr': [0.01, 0.02],
'max_epochs': [10, 20],
}
gs = GridSearchCV(net, params, refit=True, cv=3, scoring='accuracy')
gs = GridSearchCV(
net, params, refit=True, cv=3, scoring='accuracy', error_score='raise'
)
gs.fit(slds, y) # does not raise

def test_grid_search_with_slds_X_and_slds_y(
@@ -455,7 +476,9 @@ def test_grid_search_with_slds_X_and_slds_y(
'lr': [0.01, 0.02],
'max_epochs': [10, 20],
}
gs = GridSearchCV(net, params, refit=False, cv=3, scoring='accuracy')
gs = GridSearchCV(
net, params, refit=False, cv=3, scoring='accuracy', error_score='raise'
)
gs.fit(slds, slds_y) # does not raise
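
All three grid search tests now pass `error_score='raise'`. The rationale, sketched with a hypothetical estimator (not from the PR): by default, `GridSearchCV` records a failed fit as a NaN score and merely warns, so a regression inside `fit` could slip through a "does not raise" test.

```python
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV

class Boom(BaseEstimator):
    # hypothetical estimator that fails for one of the grid's candidates
    def __init__(self, lr=0.1):
        self.lr = lr
    def fit(self, X, y):
        if self.lr > 0.1:
            raise RuntimeError("boom")
        return self
    def score(self, X, y):
        return 0.0

X, y = np.zeros((8, 2)), np.array([0, 1] * 4)
grid = {'lr': [0.1, 0.2]}

# default error_score=np.nan: the lr=0.2 failures only warn, scoring NaN
GridSearchCV(Boom(), grid, cv=2, refit=False).fit(X, y)

# error_score='raise' surfaces the exception instead:
# GridSearchCV(Boom(), grid, cv=2, refit=False,
#              error_score='raise').fit(X, y)  # -> RuntimeError: boom
```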

def test_index_with_2d_array_raises(self, slds):
@@ -468,6 +491,36 @@ def test_index_with_2d_array_raises(self, slds):
"dimensional arrays, got 2 dimensions instead.")
assert exc.value.args[0] == msg

@pytest.mark.parametrize('n', [0, 1])
def test_slicedataset_to_numpy(self, slds_cls, custom_ds, n):
from skorch.utils import to_numpy

slds = slds_cls(custom_ds, idx=n)
expected = custom_ds.X if n == 0 else custom_ds.y
result = to_numpy(slds)
np.testing.assert_array_equal(result, expected)

@pytest.mark.parametrize('n', [0, 1])
@pytest.mark.parametrize('dtype', [None, np.float16, np.int32, np.complex64])
def test_slicedataset_asarray(self, slds_cls, custom_ds, n, dtype):
torch_to_numpy_dtype_dict = {
torch.int64: np.int64,
torch.float32: np.float32,
}

slds = slds_cls(custom_ds, idx=n)
array = np.asarray(slds, dtype=dtype)
expected = custom_ds.X if n == 0 else custom_ds.y
assert array.shape == expected.shape

if dtype is not None:
assert array.dtype == dtype
else:
# if no dtype indicated, use original dtype of the data, or the
# numpy equivalent if a torch dtype
expected_dtype = torch_to_numpy_dtype_dict.get(expected.dtype, expected.dtype)
assert array.dtype == expected_dtype


class TestPredefinedSplit():

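A condensed restatement of the dtype contract that `test_slicedataset_asarray` pins down (data invented for illustration): without an explicit `dtype`, the resulting array keeps the data's own dtype, with torch dtypes mapping to their NumPy equivalents; an explicit `dtype` is forwarded to the conversion.

```python
import numpy as np
import torch
from skorch.dataset import Dataset
from skorch.helper import SliceDataset

ds = Dataset(torch.rand(10, 3), torch.arange(10))
slds_y = SliceDataset(ds, idx=1)

assert np.asarray(slds_y).dtype == np.int64  # torch.int64 -> np.int64
assert np.asarray(slds_y, dtype=np.float16).dtype == np.float16
```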
9 changes: 6 additions & 3 deletions skorch/tests/test_net.py
@@ -2202,9 +2202,12 @@ def test_set_params_with_unknown_key_raises(self, net):
with pytest.raises(ValueError) as exc:
net.set_params(foo=123)

# TODO: check error message more precisely, depending on what
# the intended message should be from sklearn side
assert exc.value.args[0].startswith('Invalid parameter foo for')
msg = exc.value.args[0]
# message contains "'" around variable name starting from sklearn 1.1
assert (
msg.startswith("Invalid parameter foo for")
or msg.startswith("Invalid parameter 'foo' for")
)

@pytest.fixture()
def sequence_module_cls(self):
8 changes: 8 additions & 0 deletions skorch/utils.py
@@ -113,6 +113,11 @@ def to_tensor(X, device, accept_sparse=False):
raise TypeError("Cannot convert this data type to a torch tensor.")


def _is_slicedataset(X):
# Cannot use isinstance because we don't want to depend on helper.py.
return hasattr(X, 'dataset') and hasattr(X, 'idx') and hasattr(X, 'indices')


def to_numpy(X):
"""Generic function to convert a pytorch tensor to numpy.

Expand All @@ -135,6 +140,9 @@ def to_numpy(X):
if isinstance(X, (tuple, list)):
return type(X)(to_numpy(x) for x in X)

if _is_slicedataset(X):
return np.asarray(X)

if not is_torch_data_type(X):
raise TypeError("Cannot convert this data type to a numpy array.")

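Taken together with the `__array__` method in helper.py, this closes the loop the changelog describes. A small round-trip sketch (illustrative data):

```python
import numpy as np
import torch
from skorch.dataset import Dataset
from skorch.helper import SliceDataset
from skorch.utils import to_numpy

ds = Dataset(torch.rand(5, 2), torch.zeros(5))
slds = SliceDataset(ds, idx=0)

# to_numpy recognizes the SliceDataset by duck typing (the dataset/idx/indices
# attributes, so utils.py need not import helper.py) and defers to
# np.asarray, which in turn invokes SliceDataset.__array__.
out = to_numpy(slds)  # raised TypeError before this patch
assert isinstance(out, np.ndarray) and out.shape == (5, 2)
```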