Bugfix: sklearn 1.1 and SliceDataset #858

Merged · 6 commits · May 24, 2022
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -14,8 +14,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed
- Initialize data loaders for training and validation dataset once per fit call instead of once per epoch ([migration guide](https://skorch.readthedocs.io/en/stable/user/FAQ.html#migration-from-0-11-to-0-12))
- It is now possible to call `np.asarray` with `SliceDataset`s (#858)

### Fixed
- Fix a bug in `SliceDataset` that prevented it from being used with `to_numpy` (#858)

## [0.11.0] - 2021-10-11

9 changes: 9 additions & 0 deletions skorch/helper.py
@@ -14,6 +14,7 @@
from skorch.cli import parse_args # pylint: disable=unused-import
from skorch.dataset import unpack_data
from skorch.utils import _make_split
from skorch.utils import to_numpy
from skorch.utils import is_torch_data_type
from skorch.utils import to_tensor

@@ -246,6 +247,14 @@ def __getitem__(self, i):

return SliceDataset(self.dataset, idx=self.idx, indices=self.indices_[i])

def __array__(self, dtype=None):
# This method is invoked when calling np.asarray(X)
# https://numpy.org/devdocs/user/basics.dispatch.html
X = [self[i] for i in range(len(self))]
if np.isscalar(X[0]):
return np.asarray(X)
return np.asarray([to_numpy(x) for x in X], dtype=dtype)


def predefined_split(dataset):
"""Uses ``dataset`` for validiation in :class:`.NeuralNet`.
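The new `__array__` hook is what NumPy's dispatch protocol looks for when `np.asarray` is called on an object. A minimal usage sketch (`Dataset` and `SliceDataset` are skorch's public classes; the data here is made up for illustration, not taken from the PR's tests):

```python
import numpy as np
import torch
from skorch.dataset import Dataset
from skorch.helper import SliceDataset

# illustrative data
X = torch.rand(100, 20)
y = torch.randint(0, 2, (100,))
ds = Dataset(X, y)

slds = SliceDataset(ds, idx=0)  # array-like view over X
arr = np.asarray(slds)          # dispatches to SliceDataset.__array__
assert arr.shape == (100, 20) and arr.dtype == np.float32
```

Note the fast path in the implementation: if the first element is a plain scalar, the collected list is converted directly; otherwise each element is run through `to_numpy` first, so torch tensors are handled as well.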
18 changes: 10 additions & 8 deletions skorch/tests/callbacks/test_all.py
@@ -46,12 +46,14 @@ def test_on_x_methods_have_kwargs(self, callbacks, on_x_methods):
def test_set_params_with_unknown_key_raises(self, base_cls):
with pytest.raises(ValueError) as exc:
base_cls().set_params(foo=123)

msg_start = (
"Invalid parameter foo for estimator <skorch.callbacks.base.Callback")
msg_end = (
"Check the list of available parameters with "
"`estimator.get_params().keys()`.")
msg = exc.value.args[0]
assert msg.startswith(msg_start)
assert msg.endswith(msg_end)

# message contains "'" around variable name starting from sklearn 1.1
assert (
msg.startswith(
"Invalid parameter foo for estimator <skorch.callbacks.base.Callback"
)
or msg.startswith(
"Invalid parameter 'foo' for estimator <skorch.callbacks.base.Callback"
)
)
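
The duplicated `startswith` branches accommodate sklearn 1.1 putting quotes around the parameter name. A terser, version-agnostic alternative would be a single regex with an optional quote — a sketch, not what the PR uses:

```python
import re

# msg would come from exc.value.args[0] in the test above
msg = "Invalid parameter 'foo' for estimator <skorch.callbacks.base.Callback..."
assert re.match(
    r"Invalid parameter '?foo'? for estimator <skorch\.callbacks\.base\.Callback",
    msg,
)
```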
69 changes: 61 additions & 8 deletions skorch/tests/test_helper.py
@@ -269,10 +269,13 @@ def test_equals_different_keys(self, sldict_cls):


class TestSliceDataset:
@pytest.fixture(scope='class')
def data(self):
@pytest.fixture(scope='class', params=['numpy', 'torch'])
def data(self, request):
X, y = make_classification(100, 20, n_informative=10, random_state=0)
return X.astype(np.float32), y
X = X.astype(np.float32)
if request.param == 'numpy':
return X, y
return torch.from_numpy(X), torch.from_numpy(y)

@pytest.fixture
def X(self, data):
@@ -331,6 +334,14 @@ def test_len_and_shape(self, slds, y):
np.array([0, 0, 1, 0] * 25, dtype=np.bool),
])
def test_len_and_shape_sliced(self, slds, y, sl):
# torch tensors don't support negative steps, skip test
if (
isinstance(sl, slice)
and (sl == slice(None, None, -1))
and isinstance(y, torch.Tensor)
):
return

assert len(slds[sl]) == len(y[sl])
assert slds[sl].shape == (len(y[sl]),)

@@ -383,12 +394,18 @@ def test_slice_three_times(self, slds_cls, custom_ds, X, y, sl0, sl1, sl2, n):
assert np.allclose(sliced, x)

def test_explicitly_pass_indices_at_init(self, slds_cls, custom_ds, X):
from skorch.utils import to_numpy
# test passing indices directly to __init__
slds = slds_cls(custom_ds, indices=np.arange(10))
sliced0 = slds[5:]
assert np.allclose(sliced0, X[5:10])

sliced1 = sliced0[2]

# comparison method depends on array type
if isinstance(sliced1, torch.Tensor):
sliced0 = to_numpy(sliced0)
sliced1 = to_numpy(sliced1)

assert np.allclose(sliced0, X[5:10])
assert np.allclose(sliced1, X[7])

def test_access_element_out_of_bounds(self, slds_cls, custom_ds):
@@ -425,7 +442,9 @@ def test_grid_search_with_slds_works(
'lr': [0.01, 0.02],
'max_epochs': [10, 20],
}
gs = GridSearchCV(net, params, refit=False, cv=3, scoring='accuracy')
gs = GridSearchCV(
net, params, refit=False, cv=3, scoring='accuracy', error_score='raise'
)
gs.fit(slds, y) # does not raise

def test_grid_search_with_slds_and_internal_split_works(
@@ -438,7 +457,9 @@ def test_grid_search_with_slds_and_internal_split_works(
'lr': [0.01, 0.02],
'max_epochs': [10, 20],
}
gs = GridSearchCV(net, params, refit=True, cv=3, scoring='accuracy')
gs = GridSearchCV(
net, params, refit=True, cv=3, scoring='accuracy', error_score='raise'
)
gs.fit(slds, y) # does not raise

def test_grid_search_with_slds_X_and_slds_y(
@@ -455,7 +476,9 @@ def test_grid_search_with_slds_X_and_slds_y(
'lr': [0.01, 0.02],
'max_epochs': [10, 20],
}
gs = GridSearchCV(net, params, refit=False, cv=3, scoring='accuracy')
gs = GridSearchCV(
net, params, refit=False, cv=3, scoring='accuracy', error_score='raise'
)
gs.fit(slds, slds_y) # does not raise
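
All three grid search tests now pass `error_score='raise'`. The rationale, sketched with a hypothetical estimator (not from the PR): by default, `GridSearchCV` records a failed fit as a NaN score and merely warns, so a regression inside `fit` could slip through a "does not raise" test.

```python
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV

class Boom(BaseEstimator):
    # hypothetical estimator that fails for one of the grid's candidates
    def __init__(self, lr=0.1):
        self.lr = lr
    def fit(self, X, y):
        if self.lr > 0.1:
            raise RuntimeError("boom")
        return self
    def score(self, X, y):
        return 0.0

X, y = np.zeros((8, 2)), np.array([0, 1] * 4)
grid = {'lr': [0.1, 0.2]}

# default error_score=np.nan: the lr=0.2 failures only warn, scoring NaN
GridSearchCV(Boom(), grid, cv=2, refit=False).fit(X, y)

# error_score='raise' surfaces the exception instead:
# GridSearchCV(Boom(), grid, cv=2, refit=False,
#              error_score='raise').fit(X, y)  # -> RuntimeError: boom
```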

def test_index_with_2d_array_raises(self, slds):
@@ -468,6 +491,36 @@ def test_index_with_2d_array_raises(self, slds):
"dimensional arrays, got 2 dimensions instead.")
assert exc.value.args[0] == msg

@pytest.mark.parametrize('n', [0, 1])
def test_slicedataset_to_numpy(self, slds_cls, custom_ds, n):
from skorch.utils import to_numpy

slds = slds_cls(custom_ds, idx=n)
expected = custom_ds.X if n == 0 else custom_ds.y
result = to_numpy(slds)
np.testing.assert_array_equal(result, expected)

@pytest.mark.parametrize('n', [0, 1])
@pytest.mark.parametrize('dtype', [None, np.float16, np.int32, np.complex64])
def test_slicedataset_asarray(self, slds_cls, custom_ds, n, dtype):
torch_to_numpy_dtype_dict = {
torch.int64: np.int64,
torch.float32: np.float32,
}

slds = slds_cls(custom_ds, idx=n)
array = np.asarray(slds, dtype=dtype)
expected = custom_ds.X if n == 0 else custom_ds.y
assert array.shape == expected.shape

if dtype is not None:
assert array.dtype == dtype
else:
# if no dtype indicated, use original dtype of the data, or the
# numpy equivalent if a torch dtype
expected_dtype = torch_to_numpy_dtype_dict.get(expected.dtype, expected.dtype)
assert array.dtype == expected_dtype


class TestPredefinedSplit():

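A condensed restatement of the dtype contract that `test_slicedataset_asarray` pins down (data invented for illustration): without an explicit `dtype`, the resulting array keeps the data's own dtype, with torch dtypes mapping to their NumPy equivalents; an explicit `dtype` is forwarded to the conversion.

```python
import numpy as np
import torch
from skorch.dataset import Dataset
from skorch.helper import SliceDataset

ds = Dataset(torch.rand(10, 3), torch.arange(10))
slds_y = SliceDataset(ds, idx=1)

assert np.asarray(slds_y).dtype == np.int64  # torch.int64 -> np.int64
assert np.asarray(slds_y, dtype=np.float16).dtype == np.float16
```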
9 changes: 6 additions & 3 deletions skorch/tests/test_net.py
@@ -2202,9 +2202,12 @@ def test_set_params_with_unknown_key_raises(self, net):
with pytest.raises(ValueError) as exc:
net.set_params(foo=123)

# TODO: check error message more precisely, depending on what
# the intended message should be from sklearn side
assert exc.value.args[0].startswith('Invalid parameter foo for')
msg = exc.value.args[0]
# message contains "'" around variable name starting from sklearn 1.1
assert (
msg.startswith("Invalid parameter foo for")
or msg.startswith("Invalid parameter 'foo' for")
)

@pytest.fixture()
def sequence_module_cls(self):
8 changes: 8 additions & 0 deletions skorch/utils.py
@@ -113,6 +113,11 @@ def to_tensor(X, device, accept_sparse=False):
raise TypeError("Cannot convert this data type to a torch tensor.")


def _is_slicedataset(X):
# Cannot use isinstance because we don't want to depend on helper.py.
return hasattr(X, 'dataset') and hasattr(X, 'idx') and hasattr(X, 'indices')


def to_numpy(X):
"""Generic function to convert a pytorch tensor to numpy.

Expand All @@ -135,6 +140,9 @@ def to_numpy(X):
if isinstance(X, (tuple, list)):
return type(X)(to_numpy(x) for x in X)

if _is_slicedataset(X):
return np.asarray(X)

if not is_torch_data_type(X):
raise TypeError("Cannot convert this data type to a numpy array.")

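Taken together with the `__array__` method in helper.py, this closes the loop the changelog describes. A small round-trip sketch (illustrative data):

```python
import numpy as np
import torch
from skorch.dataset import Dataset
from skorch.helper import SliceDataset
from skorch.utils import to_numpy

ds = Dataset(torch.rand(5, 2), torch.zeros(5))
slds = SliceDataset(ds, idx=0)

# to_numpy recognizes the SliceDataset by duck typing (the dataset/idx/indices
# attributes, so utils.py need not import helper.py) and defers to
# np.asarray, which in turn invokes SliceDataset.__array__.
out = to_numpy(slds)  # raised TypeError before this patch
assert isinstance(out, np.ndarray) and out.shape == (5, 2)
```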