diff --git a/README.md b/README.md index 4ffa2b6..dc4fdb4 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,9 @@ from sklearn.model_selection import cross_validate kf = KFold(n_splits=5, shuffle=True, random_state=334) print(cross_validate(estimator, X, y, cv=kf)) ``` + OR + ```python from sklearn.model_selection import cross_validate @@ -136,10 +138,20 @@ If you want to run the notebook in example directory, you will need to additiona ## Parallelization (since v2.1.0) -This algorithm is very computationally intensive and takes a lot of computation time. -To solve this problem, we have implemented parallelization and optimized the algorithm since v2.1.0. +This algorithm is very computationally intensive and takes a lot of time. +To solve this problem, I have implemented parallelization and optimized the algorithm since v2.1.0. `n_jobs` can be specified for parallelization as in the scikit-learn-like api. +```python +# parallelization KFold +kf = KFold(n_splits=5, n_jobs=-1) + +# parallelization train_test_split +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, n_jobs=-1 +) +``` + ## LICENSE @@ -176,5 +188,9 @@ Copyright (c) 2021 yu9824 - Optimize algorithm - Deal with Large number of data. - parallel calculation when calculating distance (Add `n_jobs` argument) - - recursion number settings + - replacing recursive functions with for-loops - Add other than "euclidean" calculation methods (Add `metric` argument) + +### v2.1.1 + +- Fix bug when `metric="nan_euclidean"`. 
diff --git a/kennard_stone/__init__.py b/kennard_stone/__init__.py index 2b7bc96..6e10d61 100644 --- a/kennard_stone/__init__.py +++ b/kennard_stone/__init__.py @@ -1,6 +1,6 @@ from .kennard_stone import KFold, train_test_split -__version__ = "2.1.0" +__version__ = "2.1.1" __license__ = "MIT" __author__ = "yu9824" __copyright__ = "Copyright © 2021 yu9824" diff --git a/kennard_stone/kennard_stone.py b/kennard_stone/kennard_stone.py index aad39ea..b3e52c2 100644 --- a/kennard_stone/kennard_stone.py +++ b/kennard_stone/kennard_stone.py @@ -4,13 +4,13 @@ from typing import overload, Union, Optional, Generator -# deprecated in Python >= 3.9 +# The following is deprecated in Python >= 3.9 from typing import List, Set + from itertools import chain import warnings import numpy as np - from sklearn.model_selection._split import BaseShuffleSplit from sklearn.model_selection._split import _BaseKFold from sklearn.model_selection._split import _validate_shuffle_split @@ -44,7 +44,8 @@ def __init__( *, metric: str = "euclidean", n_jobs: Optional[int] = None, - **kwargs, + random_state: None = None, + shuffle: None = None, ) -> None: """K-Folds cross-validator using the Kennard-Stone algorithm. @@ -78,7 +79,7 @@ self.metric = metric self.n_jobs = n_jobs - if "shuffle" in kwargs: + if shuffle is not None: warnings.warn( "`shuffle` is unnecessary because it is always shuffled" " in this algorithm.", @@ -86,7 +87,7 @@ ) del self.shuffle - if "random_state" in kwargs: + if random_state is not None: warnings.warn( "`random_state` is unnecessary since it is uniquely determined" " in this algorithm.", @@ -166,7 +167,8 @@ def train_test_split( train_size: Optional[Union[float, int]] = None, metric: str = "euclidean", n_jobs: Optional[int] = None, - **kwargs, + random_state: None = None, + shuffle: None = None, ) -> list: """Split arrays or matrices into train and test subsets using the Kennard-Stone algorithm. 
@@ -223,14 +225,14 @@ def train_test_split( ------ ValueError """ - if "shuffle" in kwargs: + if shuffle is not None: warnings.warn( "`shuffle` is unnecessary because it is always shuffled" " in this algorithm.", UserWarning, ) - if "random_state" in kwargs: + if random_state is not None: warnings.warn( "`random_state` is unnecessary since it is uniquely determined" " in this algorithm.", @@ -308,7 +310,14 @@ def __init__( def get_indexes(self, X) -> List[List[int]]: # check input array - X: np.ndarray = check_array(X, ensure_2d=True, dtype="numeric") + X: np.ndarray = check_array( + X, + ensure_2d=True, + dtype="numeric", + force_all_finite="allow-nan" + if self.metric == "nan_euclidean" + else True, + ) n_samples = X.shape[0] # drop no variance @@ -343,8 +352,6 @@ def get_indexes(self, X) -> List[List[int]]: distance_min = self.distance_matrix[idx_farthest, :] - # recursion limit settings - # params indexes_selected = idx_farthest lst_indexes_selected_prev = [[] for _ in range(self.n_groups)] diff --git a/tests/test_nan_euclidean.py b/tests/test_nan_euclidean.py new file mode 100644 index 0000000..5216c54 --- /dev/null +++ b/tests/test_nan_euclidean.py @@ -0,0 +1,12 @@ +from sklearn.datasets import load_diabetes + +from kennard_stone.kennard_stone import _KennardStone + + +def test_nan_euclidean(): + X = load_diabetes(as_frame=True).data.copy() + + X.iloc[1, 1] = float("nan") + + ks = _KennardStone(n_groups=1, metric="nan_euclidean") + ks.get_indexes(X)