238 giving a fraction of samples instead of a number of samples in the subsample class #464

1 change: 1 addition & 0 deletions HISTORY.rst
@@ -10,6 +10,7 @@ History

* Fix the quantile formula to ensure valid coverage (deal with infinite interval production and asymmetric conformal scores).
* Fix sphinx dependencies
* Allow `n_samples` to be a fraction between 0 and 1 of the training set size when using the `split` method of the `Subsample` class.

0.8.5 (2024-06-07)
------------------
12 changes: 6 additions & 6 deletions mapie/subsample.py
@@ -10,6 +10,7 @@
from sklearn.utils.validation import _num_samples

from ._typing import NDArray
from .utils import check_n_samples


class Subsample(BaseCrossValidator):
@@ -22,9 +23,10 @@ class Subsample(BaseCrossValidator):
----------
n_resamplings : int
Number of resamplings. By default ``30``.
n_samples: int
n_samples: Optional[Union[int, float]]
Number of samples in each resampling. By default ``None``,
the size of the training set.
the size of the training set. If it is a float strictly between
0 and 1, it is interpreted as the fraction of the training set to draw.
replace: bool
Whether to replace samples in resamplings or not. By default ``True``.
random_state: Optional[Union[int, RandomState]]
@@ -46,7 +48,7 @@ class Subsample(BaseCrossValidator):
def __init__(
self,
n_resamplings: int = 30,
n_samples: Optional[int] = None,
n_samples: Optional[Union[int, float]] = None,
replace: bool = True,
random_state: Optional[Union[int, RandomState]] = None,
) -> None:
@@ -74,9 +76,7 @@ def split(
The testing set indices for that split.
"""
indices = np.arange(_num_samples(X))
n_samples = (
self.n_samples if self.n_samples is not None else len(indices)
)
n_samples = check_n_samples(X, self.n_samples, indices)
random_state = check_random_state(self.random_state)
for k in range(self.n_resamplings):
train_index = resample(
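To illustrate the behaviour introduced by this change, here is a minimal usage sketch (assuming this branch of MAPIE is installed; the data and parameter values are arbitrary):

```python
import numpy as np
from mapie.subsample import Subsample

X = np.arange(20)

# Fractional n_samples: each resampling draws floor(0.5 * 20) = 10 indices.
cv_frac = Subsample(n_resamplings=2, n_samples=0.5, replace=False, random_state=0)
for train_idx, val_idx in cv_frac.split(X):
    print(len(train_idx), len(val_idx))  # 10 10

# Integer n_samples keeps the previous behaviour: draw exactly 15 indices.
cv_int = Subsample(n_resamplings=2, n_samples=15, replace=False, random_state=0)
for train_idx, val_idx in cv_int.split(X):
    print(len(train_idx), len(val_idx))  # 15 5
```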
44 changes: 44 additions & 0 deletions mapie/tests/test_subsample.py
@@ -32,6 +32,50 @@ def test_split_SubSample() -> None:
np.testing.assert_equal(tests, tests_expected)


@pytest.mark.parametrize("n_samples", [4, 6, 8, 10])
@pytest.mark.parametrize("n_resamplings", [1, 2, 3])
def test_n_samples_int(n_samples: int,
n_resamplings: int) -> None:
"""Test outputs of subsamplings when n_samples is a int"""
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
cv = Subsample(n_resamplings=n_resamplings, random_state=0,
n_samples=n_samples, replace=False)
train_set = np.concatenate([x[0] for x in cv.split(X)])
val_set = np.concatenate([x[1] for x in cv.split(X)])
assert len(train_set) == n_samples*n_resamplings
assert len(val_set) == (X.shape[0] - n_samples)*n_resamplings


@pytest.mark.parametrize("n_samples", [0.4, 0.6, 0.8, 0.9])
@pytest.mark.parametrize("n_resamplings", [1, 2, 3])
def test_n_samples_float(n_samples: float,
n_resamplings: int) -> None:
"""Test outputs of subsamplings when n_samples is a
float between 0 and 1."""
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
cv = Subsample(n_resamplings=n_resamplings, random_state=0,
n_samples=n_samples, replace=False)
train_set = np.concatenate([x[0] for x in cv.split(X)])
val_set = np.concatenate([x[1] for x in cv.split(X)])
assert len(train_set) == int(np.floor(n_samples*X.shape[0]))*n_resamplings
assert len(val_set) == (
(X.shape[0] - int(np.floor(n_samples * X.shape[0]))) *
n_resamplings
)
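The expected lengths asserted in `test_n_samples_float` above follow directly from the flooring rule; a small arithmetic sketch (NumPy only, independent of MAPIE):

```python
import numpy as np

n_total = 10  # X.shape[0] in the test above
for n_samples in (0.4, 0.6, 0.8, 0.9):
    n_train = int(np.floor(n_samples * n_total))  # indices drawn per resampling
    n_val = n_total - n_train                     # complement when replace=False
    print(n_samples, n_train, n_val)
# 0.4 -> 4/6, 0.6 -> 6/4, 0.8 -> 8/2, 0.9 -> 9/1 (train/val per resampling)
```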


@pytest.mark.parametrize("n_resamplings", [1, 2, 3])
def test_n_samples_none(n_resamplings: int) -> None:
"""Test outputs of subsamplings when n_samples is None."""
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
cv = Subsample(n_resamplings=n_resamplings, random_state=0,
replace=False)
train_set = np.concatenate([x[0] for x in cv.split(X)])
val_set = np.concatenate([x[1] for x in cv.split(X)])
assert len(train_set) == X.shape[0]*n_resamplings
assert len(val_set) == 0


def test_default_parameters_BlockBootstrap() -> None:
"""Test default values of Subsample."""
cv = BlockBootstrap()
69 changes: 67 additions & 2 deletions mapie/tests/test_utils.py
@@ -1,9 +1,9 @@
from __future__ import annotations

from typing import Any, Optional, Tuple

import numpy as np
import pytest
import re
from numpy.random import RandomState
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
@@ -17,7 +17,8 @@
check_array_inf, check_array_nan, check_arrays_length,
check_binary_zero_one, check_cv, check_gamma,
check_lower_upper_bounds, check_n_features_in,
check_n_jobs, check_no_agg_cv, check_null_weight,
check_n_jobs, check_no_agg_cv, check_n_samples,
check_null_weight,
check_number_bins, check_split_strategy,
check_verbose, compute_quantiles, fit_estimator,
get_binning_groups)
@@ -508,3 +509,67 @@ def test_check_no_agg_cv_value_error(cv: Any) -> None:
match=r"Allowed values must have the `get_n_splits` method"
):
check_no_agg_cv(X_toy, cv, array)


@pytest.mark.parametrize("n_samples", [-4, -2, -1])
def test_invalid_n_samples_int_negative(n_samples: int) -> None:
"""Test that invalid n_samples raise errors."""
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
indices = X.copy()
with pytest.raises(
ValueError,
match=re.escape(
r"Invalid n_samples. Allowed values "
r"are float in the range (0.0, 1.0) or"
r" int in the range [1, inf)"
)
):
check_n_samples(X=X, n_samples=n_samples, indices=indices)


@pytest.mark.parametrize("n_samples", [0.002, 0.003, 0.04])
def test_invalid_n_samples_float_too_small(n_samples: float) -> None:
"""Test that a too small float n_samples raises an error."""
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
indices = X.copy()
with pytest.raises(
ValueError,
match=re.escape(
r"The value of n_samples is too small. "
r"You need to increase it so that n_samples*X.shape[0] > 1"
r"otherwise n_samples should be an int"
)
):
check_n_samples(X=X, n_samples=n_samples, indices=indices)


@pytest.mark.parametrize("n_samples", [-5.5, -4.3, -0.2])
def test_invalid_n_samples_float_negative(n_samples: float) -> None:
"""Test that invalid n_samples raise errors."""
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
indices = X.copy()
with pytest.raises(
ValueError,
match=re.escape(
r"Invalid n_samples. Allowed values "
r"are float in the range (0.0, 1.0) or"
r" int in the range [1, inf)"
)
):
check_n_samples(X=X, n_samples=n_samples, indices=indices)


@pytest.mark.parametrize("n_samples", [1.2, 2.5, 3.4])
def test_invalid_n_samples_float_greater_than_1(n_samples: float) -> None:
"""Test that invalid n_samples raise errors."""
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
indices = X.copy()
with pytest.raises(
ValueError,
match=re.escape(
r"Invalid n_samples. Allowed values "
r"are float in the range (0.0, 1.0) or"
r" int in the range [1, inf)"
)
):
check_n_samples(X=X, n_samples=n_samples, indices=indices)
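A note on the `re.escape` calls in these tests: the expected messages contain regex metacharacters (`(`, `[`, `*`, `.`), so matching them literally requires escaping. A minimal illustration outside pytest:

```python
import re

message = "are float in the range (0.0, 1.0) or int in the range [1, inf)"

# Escaped, the message matches itself literally.
assert re.search(re.escape(message), message) is not None

# Unescaped, '[' opens an unterminated character class and compilation fails.
try:
    re.search(message, message)
except re.error as exc:
    print(exc)  # e.g. "unterminated character set at position ..."
```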
53 changes: 53 additions & 0 deletions mapie/utils.py
@@ -1355,3 +1355,56 @@ def check_arrays_length(*arrays: NDArray) -> None:
raise ValueError(
"There are arrays with different length"
)


def check_n_samples(
X: NDArray,
n_samples: Optional[Union[float, int]],
indices: NDArray
) -> int:
"""
Check n_samples and convert it to a valid number of samples.

Parameters
----------
X: NDArray
Training data, used to compute the number of samples
when n_samples is given as a fraction.
n_samples: Optional[Union[float, int]]
If a float strictly between 0 and 1, it represents the fraction
of the data to include in the train sample.
If an int, it represents the number of elements in the train sample.
If None, all indices are used.
indices: NDArray
Indices of the training data, used when n_samples is None.

Returns
-------
int
Number of samples to draw in each resampling.

Raises
------
ValueError
If n_samples is not an int in the range [1, inf)
or a float in the range (0.0, 1.0).
"""
if n_samples is None:
n_samples = len(indices)
elif isinstance(n_samples, float):
if 0 < n_samples < 1:
n_samples = int(np.floor(n_samples * X.shape[0]))
if n_samples == 0:
raise ValueError(
"The value of n_samples is too small. "
"You need to increase it so that n_samples*X.shape[0] > 1"
"otherwise n_samples should be an int"
)
else:
raise ValueError(
"Invalid n_samples. Allowed values "
"are float in the range (0.0, 1.0) or"
" int in the range [1, inf)"
)
elif isinstance(n_samples, int) and n_samples <= 0:
raise ValueError(
"Invalid n_samples. Allowed values "
"are float in the range (0.0, 1.0) or"
" int in the range [1, inf)"
)
return int(n_samples)
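As a quick sanity check of the helper added above, a sketch of calling `check_n_samples` directly (assuming it is importable from `mapie.utils` as in this diff):

```python
import numpy as np
from mapie.utils import check_n_samples

X = np.arange(10)
indices = np.arange(10)

print(check_n_samples(X=X, n_samples=None, indices=indices))  # 10: full training set
print(check_n_samples(X=X, n_samples=7, indices=indices))     # 7: int used as-is
print(check_n_samples(X=X, n_samples=0.3, indices=indices))   # 3: floor(0.3 * 10)

# Invalid values raise ValueError, e.g. a float outside (0.0, 1.0):
try:
    check_n_samples(X=X, n_samples=1.5, indices=indices)
except ValueError as exc:
    print(exc)
```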