From 82c4519137db2957035ac8fc6c7717f309edf82a Mon Sep 17 00:00:00 2001 From: yu9824 <58211916+yu9824@users.noreply.github.com> Date: Thu, 4 May 2023 16:06:43 +0900 Subject: [PATCH 1/6] Update README --- README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4ffa2b6..d16a930 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,9 @@ from sklearn.model_selection import cross_validate kf = KFold(n_splits=5, shuffle=True, random_state=334) print(cross_validate(estimator, X, y, cv=kf)) ``` + OR + ```python from sklearn.model_selection import cross_validate @@ -136,10 +138,20 @@ If you want to run the notebook in example directory, you will need to additiona ## Parallelization (since v2.1.0) -This algorithm is very computationally intensive and takes a lot of computation time. -To solve this problem, we have implemented parallelization and optimized the algorithm since v2.1.0. +This algorithm is very computationally intensive and takes a lot of time. +To solve this problem, I have implemented parallelization and optimized the algorithm since v2.1.0. `n_jobs` can be specified for parallelization as in the scikit-learn-like api. 
+```python +# parallelization KFold +kf = KFold(n_splits=5, n_jobs=-1) + +# parallelization train_test_split +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, n_jobs=-1 +) +``` + ## LICENSE From 0e77cf17c2096e64b009cad8e01f69e86cfa99c4 Mon Sep 17 00:00:00 2001 From: yu9824 <58211916+yu9824@users.noreply.github.com> Date: Thu, 4 May 2023 16:41:27 +0900 Subject: [PATCH 2/6] Fix #10 and create test for it --- kennard_stone/kennard_stone.py | 13 ++++++++++--- tests/test_nan_euclidean.py | 12 ++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 tests/test_nan_euclidean.py diff --git a/kennard_stone/kennard_stone.py b/kennard_stone/kennard_stone.py index aad39ea..f2b3e6a 100644 --- a/kennard_stone/kennard_stone.py +++ b/kennard_stone/kennard_stone.py @@ -4,13 +4,13 @@ from typing import overload, Union, Optional, Generator -# deprecated in Python >= 3.9 +# The following are deprecated in Python >= 3.9 from typing import List, Set + from itertools import chain import warnings import numpy as np - from sklearn.model_selection._split import BaseShuffleSplit from sklearn.model_selection._split import _BaseKFold from sklearn.model_selection._split import _validate_shuffle_split @@ -308,7 +308,14 @@ def __init__( def get_indexes(self, X) -> List[List[int]]: # check input array - X: np.ndarray = check_array(X, ensure_2d=True, dtype="numeric") + X: np.ndarray = check_array( + X, + ensure_2d=True, + dtype="numeric", + force_all_finite="allow-nan" + if self.metric == "nan_euclidean" + else True, + ) n_samples = X.shape[0] # drop no variance diff --git a/tests/test_nan_euclidean.py b/tests/test_nan_euclidean.py new file mode 100644 index 0000000..5216c54 --- /dev/null +++ b/tests/test_nan_euclidean.py @@ -0,0 +1,12 @@ +from sklearn.datasets import load_diabetes + +from kennard_stone.kennard_stone import _KennardStone + + +def test_nan_euclidean(): + X = load_diabetes(as_frame=True).data.copy() + + X.iloc[1, 1] = float("nan") + 
+ ks = _KennardStone(n_groups=1, metric="nan_euclidean") + ks.get_indexes(X) From 8acfcf848b4b83409b05668de89be45593e4cb67 Mon Sep 17 00:00:00 2001 From: yu9824 <58211916+yu9824@users.noreply.github.com> Date: Thu, 4 May 2023 16:44:27 +0900 Subject: [PATCH 3/6] Update README --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index d16a930..3429e42 100644 --- a/README.md +++ b/README.md @@ -190,3 +190,7 @@ Copyright (c) 2021 yu9824 - parallel calculation when calculating distance (Add `n_jobs` argument) - recursion number settings - Add other than "euclidean" calculation methods (Add `metric` argument) + +### v2.1.1 + +- Fix bug when `metric="nan_euclidean"`. From 5f4860d2143725ce71fd6b2a1a96f24f75324d8c Mon Sep 17 00:00:00 2001 From: yu9824 <58211916+yu9824@users.noreply.github.com> Date: Thu, 4 May 2023 16:45:49 +0900 Subject: [PATCH 4/6] v2.1.1 --- kennard_stone/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kennard_stone/__init__.py b/kennard_stone/__init__.py index 2b7bc96..6e10d61 100644 --- a/kennard_stone/__init__.py +++ b/kennard_stone/__init__.py @@ -1,6 +1,6 @@ from .kennard_stone import KFold, train_test_split -__version__ = "2.1.0" +__version__ = "2.1.1" __license__ = "MIT" __author__ = "yu9824" __copyright__ = "Copyright © 2021 yu9824" From 7c53ae8b14f66ac70147999039a2298e77c48dec Mon Sep 17 00:00:00 2001 From: yu9824 <58211916+yu9824@users.noreply.github.com> Date: Sat, 6 May 2023 13:42:31 +0900 Subject: [PATCH 5/6] Update History --- README.md | 2 +- kennard_stone/kennard_stone.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 3429e42..dc4fdb4 100644 --- a/README.md +++ b/README.md @@ -188,7 +188,7 @@ Copyright (c) 2021 yu9824 - Optimize algorithm - Deal with Large number of data. 
- parallel calculation when calculating distance (Add `n_jobs` argument) - - recursion number settings + - replacing recursive functions with for-loops - Add other than "euclidean" calculation methods (Add `metric` argument) ### v2.1.1 diff --git a/kennard_stone/kennard_stone.py b/kennard_stone/kennard_stone.py index f2b3e6a..bc5528e 100644 --- a/kennard_stone/kennard_stone.py +++ b/kennard_stone/kennard_stone.py @@ -350,8 +350,6 @@ def get_indexes(self, X) -> List[List[int]]: distance_min = self.distance_matrix[idx_farthest, :] - # recursion limit settings - # params indexes_selected = idx_farthest lst_indexes_selected_prev = [[] for _ in range(self.n_groups)] From d2db6320d24fba1aaa84329e086df91e4bd777db Mon Sep 17 00:00:00 2001 From: yu9824 <58211916+yu9824@users.noreply.github.com> Date: Sat, 6 May 2023 13:53:16 +0900 Subject: [PATCH 6/6] Remove kwargs to avoid error overhead. --- kennard_stone/kennard_stone.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kennard_stone/kennard_stone.py b/kennard_stone/kennard_stone.py index bc5528e..b3e52c2 100644 --- a/kennard_stone/kennard_stone.py +++ b/kennard_stone/kennard_stone.py @@ -44,7 +44,8 @@ def __init__( *, metric: str = "euclidean", n_jobs: Optional[int] = None, - **kwargs, + random_state: None = None, + shuffle: None = None, ) -> None: """K-Folds cross-validator using the Kennard-Stone algorithm. 
@@ -78,7 +79,7 @@ def __init__( self.metric = metric self.n_jobs = n_jobs - if "shuffle" in kwargs: + if shuffle is not None: warnings.warn( "`shuffle` is unnecessary because it is always shuffled" " in this algorithm.", @@ -86,7 +87,7 @@ def __init__( ) del self.shuffle - if "random_state" in kwargs: + if random_state is not None: warnings.warn( "`random_state` is unnecessary since it is uniquely determined" " in this algorithm.", @@ -166,7 +167,8 @@ def train_test_split( train_size: Optional[Union[float, int]] = None, metric: str = "euclidean", n_jobs: Optional[int] = None, - **kwargs, + random_state: None = None, + shuffle: None = None, ) -> list: """Split arrays or matrices into train and test subsets using the Kennard-Stone algorithm. @@ -223,14 +225,14 @@ def train_test_split( ------ ValueError """ - if "shuffle" in kwargs: + if shuffle is not None: warnings.warn( "`shuffle` is unnecessary because it is always shuffled" " in this algorithm.", UserWarning, ) - if "random_state" in kwargs: + if random_state is not None: warnings.warn( "`random_state` is unnecessary since it is uniquely determined" " in this algorithm.",