Skip to content

Commit

Permalink
Merge pull request #11 from yu9824/dev
Browse files Browse the repository at this point in the history
v2.1.1 release

Fix #10 .
  • Loading branch information
yu9824 authored May 6, 2023
2 parents 016571c + d2db632 commit a83718b
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 15 deletions.
22 changes: 19 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,9 @@ from sklearn.model_selection import cross_validate
kf = KFold(n_splits=5, shuffle=True, random_state=334)
print(cross_validate(estimator, X, y, cv=kf))
```

OR

```python
from sklearn.model_selection import cross_validate

Expand All @@ -136,10 +138,20 @@ If you want to run the notebook in example directory, you will need to additiona

## Parallelization (since v2.1.0)

This algorithm is very computationally intensive and takes a lot of computation time.
To solve this problem, we have implemented parallelization and optimized the algorithm since v2.1.0.
This algorithm is very computationally intensive and takes a lot of time.
To solve this problem, I have implemented parallelization and optimized the algorithm since v2.1.0.
`n_jobs` can be specified for parallelization as in the scikit-learn-like api.

```python
# parallelization KFold
kf = KFold(n_splits=5, n_jobs=-1)

# parallelization train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, n_jobs=-1
)
```


## LICENSE

Expand Down Expand Up @@ -176,5 +188,9 @@ Copyright (c) 2021 yu9824
- Optimize algorithm
- Deal with a large number of data points.
- parallel calculation when calculating distance (Add `n_jobs` argument)
- recursion number settings
- replacing recursive functions with for-loops
- Add other than "euclidean" calculation methods (Add `metric` argument)

### v2.1.1

- Fix bug when `metric="nan_euclidean"`.
2 changes: 1 addition & 1 deletion kennard_stone/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .kennard_stone import KFold, train_test_split

__version__ = "2.1.0"
__version__ = "2.1.1"
__license__ = "MIT"
__author__ = "yu9824"
__copyright__ = "Copyright © 2021 yu9824"
Expand Down
29 changes: 18 additions & 11 deletions kennard_stone/kennard_stone.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

from typing import overload, Union, Optional, Generator

# deprecated in Python >= 3.9
# The following has been deprecated in Python >= 3.9
from typing import List, Set

from itertools import chain
import warnings

import numpy as np

from sklearn.model_selection._split import BaseShuffleSplit
from sklearn.model_selection._split import _BaseKFold
from sklearn.model_selection._split import _validate_shuffle_split
Expand Down Expand Up @@ -44,7 +44,8 @@ def __init__(
*,
metric: str = "euclidean",
n_jobs: Optional[int] = None,
**kwargs,
random_state: None = None,
shuffle: None = None,
) -> None:
"""K-Folds cross-validator using the Kennard-Stone algorithm.
Expand Down Expand Up @@ -78,15 +79,15 @@ def __init__(
self.metric = metric
self.n_jobs = n_jobs

if "shuffle" in kwargs:
if shuffle is not None:
warnings.warn(
"`shuffle` is unnecessary because it is always shuffled"
" in this algorithm.",
UserWarning,
)
del self.shuffle

if "random_state" in kwargs:
if random_state is not None:
warnings.warn(
"`random_state` is unnecessary since it is uniquely determined"
" in this algorithm.",
Expand Down Expand Up @@ -166,7 +167,8 @@ def train_test_split(
train_size: Optional[Union[float, int]] = None,
metric: str = "euclidean",
n_jobs: Optional[int] = None,
**kwargs,
random_state: None = None,
shuffle: None = None,
) -> list:
"""Split arrays or matrices into train and test subsets using the
Kennard-Stone algorithm.
Expand Down Expand Up @@ -223,14 +225,14 @@ def train_test_split(
------
ValueError
"""
if "shuffle" in kwargs:
if shuffle is not None:
warnings.warn(
"`shuffle` is unnecessary because it is always shuffled"
" in this algorithm.",
UserWarning,
)

if "random_state" in kwargs:
if random_state is not None:
warnings.warn(
"`random_state` is unnecessary since it is uniquely determined"
" in this algorithm.",
Expand Down Expand Up @@ -308,7 +310,14 @@ def __init__(

def get_indexes(self, X) -> List[List[int]]:
# check input array
X: np.ndarray = check_array(X, ensure_2d=True, dtype="numeric")
X: np.ndarray = check_array(
X,
ensure_2d=True,
dtype="numeric",
force_all_finite="allow-nan"
if self.metric == "nan_euclidean"
else True,
)
n_samples = X.shape[0]

# drop no variance
Expand Down Expand Up @@ -343,8 +352,6 @@ def get_indexes(self, X) -> List[List[int]]:

distance_min = self.distance_matrix[idx_farthest, :]

# recursion limit settings

# params
indexes_selected = idx_farthest
lst_indexes_selected_prev = [[] for _ in range(self.n_groups)]
Expand Down
12 changes: 12 additions & 0 deletions tests/test_nan_euclidean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from sklearn.datasets import load_diabetes

from kennard_stone.kennard_stone import _KennardStone


def test_nan_euclidean():
    """`get_indexes` must accept arrays containing NaN when metric="nan_euclidean".

    Regression test for the v2.1.1 fix: `check_array` previously rejected
    inputs containing NaN regardless of the chosen metric, so selecting
    ``metric="nan_euclidean"`` raised instead of running.
    """
    X = load_diabetes(as_frame=True).data.copy()

    # Inject a single missing value; nan_euclidean should tolerate it.
    X.iloc[1, 1] = float("nan")

    ks = _KennardStone(n_groups=1, metric="nan_euclidean")
    indexes = ks.get_indexes(X)

    # With n_groups=1 the result should be one group containing every
    # sample index exactly once (a full Kennard-Stone ordering).
    assert len(indexes) == 1
    assert sorted(indexes[0]) == list(range(len(X)))

0 comments on commit a83718b

Please sign in to comment.