diff --git a/.github/workflows/python-package-conda.yaml b/.github/workflows/pytest-on-each-version.yaml
similarity index 96%
rename from .github/workflows/python-package-conda.yaml
rename to .github/workflows/pytest-on-each-version.yaml
index ad4cc06..b8039a0 100644
--- a/.github/workflows/python-package-conda.yaml
+++ b/.github/workflows/pytest-on-each-version.yaml
@@ -1,6 +1,6 @@
 # Regrences
 # - https://enu23456.hatenablog.com/entry/2022/11/24/195744
-name: Python Package using Conda
+name: Test on each version
 
 on: [push]
 
diff --git a/README.md b/README.md
index 2d56bb5..4ffa2b6 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,11 @@
 # Kennard Stone
+
 [![python_badge](https://img.shields.io/pypi/pyversions/kennard-stone)](https://pypi.org/project/kennard-stone/)
 [![license_badge](https://img.shields.io/pypi/l/kennard-stone)](https://pypi.org/project/kennard-stone/)
 [![PyPI version](https://badge.fury.io/py/kennard-stone.svg)](https://pypi.org/project/kennard-stone/)
 [![Downloads](https://pepy.tech/badge/kennard-stone)](https://pepy.tech/project/kennard-stone)
-[![Python Package using Conda](https://github.com/yu9824/kennard_stone/actions/workflows/python-package-conda.yaml/badge.svg)](https://github.com/yu9824/kennard_stone/actions/workflows/python-package-conda.yaml)
+[![Test on each version](https://github.com/yu9824/kennard_stone/actions/workflows/pytest-on-each-version.yaml/badge.svg)](https://github.com/yu9824/kennard_stone/actions/workflows/pytest-on-each-version.yaml)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![Anaconda-Server Badge](https://anaconda.org/conda-forge/kennard-stone/badges/version.svg)](https://anaconda.org/conda-forge/kennard-stone)
 [![Anaconda-platform badge](https://anaconda.org/conda-forge/kennard-stone/badges/platforms.svg)](https://anaconda.org/conda-forge/kennard-stone)
@@ -133,6 +134,13 @@ If these arguments are included, they do not cause an error. They simply have no
 
 If you want to run the notebook in example directory, you will need to additionally download `pandas`, `matplotlib`, `seaborn`, `tqdm`, and `jupyter` other than the packages in requirements.txt.
 
+## Parallelization (since v2.1.0)
+
+This algorithm is computationally intensive and can take a long time on large datasets.
+To mitigate this, the algorithm has been optimized and parallelized since v2.1.0.
+`n_jobs` can be specified for parallelization, following the scikit-learn-style API.
+
+
 ## LICENSE
 
 MIT Licence
@@ -141,13 +149,14 @@ Copyright (c) 2021 yu9824
 
 ## References
 
+
 ### Papers
 
-* R. W. Kennard & L. A. Stone (1969) Computer Aided Design of Experiments, Technometrics, 11:1, 137-148, DOI: [10.1080/00401706.1969.10490666](https://doi.org/10.1080/00401706.1969.10490666)
+- R. W. Kennard & L. A. Stone (1969) Computer Aided Design of Experiments, Technometrics, 11:1, 137-148, DOI: [10.1080/00401706.1969.10490666](https://doi.org/10.1080/00401706.1969.10490666)
 
 ### Sites
 
-* [https://datachemeng.com/trainingtestdivision/](https://datachemeng.com/trainingtestdivision/) (Japanese site)
+- [https://datachemeng.com/trainingtestdivision/](https://datachemeng.com/trainingtestdivision/) (Japanese site)
 
 ## Histories
 
@@ -161,3 +170,11 @@ Copyright (c) 2021 yu9824
 ### v2.0.1
 
 - Fix bug with Python3.7.
+
+### v2.1.0
+
+- Optimize the algorithm
+- Handle large datasets
+  - Parallelize the distance calculation (add the `n_jobs` argument)
+  - Adjust the recursion settings
+- Add distance metrics other than "euclidean" (add the `metric` argument)
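The README section above only names the new arguments; here is a minimal usage sketch of the API added in this diff (the dataset choice is illustrative, everything else follows the signatures shown further down):

```python
from sklearn.datasets import fetch_california_housing

from kennard_stone import train_test_split

data = fetch_california_housing(as_frame=True)
X, y = data.data, data.target

# Parallelize the pairwise-distance computation on all cores and use the
# Manhattan distance instead of the default "euclidean".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, metric="manhattan", n_jobs=-1
)
```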
diff --git a/kennard_stone/__init__.py b/kennard_stone/__init__.py
index cdab5c7..2b7bc96 100644
--- a/kennard_stone/__init__.py
+++ b/kennard_stone/__init__.py
@@ -1,6 +1,6 @@
 from .kennard_stone import KFold, train_test_split
 
-__version__ = "2.0.1"
+__version__ = "2.1.0"
 __license__ = "MIT"
 __author__ = "yu9824"
 __copyright__ = "Copyright © 2021 yu9824"
diff --git a/kennard_stone/kennard_stone.py b/kennard_stone/kennard_stone.py
index e072137..aad39ea 100644
--- a/kennard_stone/kennard_stone.py
+++ b/kennard_stone/kennard_stone.py
@@ -2,7 +2,10 @@
 Copyright © 2021 yu9824
 """
 
-from typing import List, Union, Optional
+from typing import overload, Union, Optional, Generator
+
+# deprecated in Python >= 3.9
+from typing import List, Set
 from itertools import chain
 import warnings
 
@@ -12,24 +15,68 @@
 from sklearn.model_selection._split import _BaseKFold
 from sklearn.model_selection._split import _validate_shuffle_split
 from sklearn.utils.validation import _num_samples
-from sklearn.utils import indexable, _safe_indexing
+from sklearn.utils import indexable
+from sklearn.utils import _safe_indexing
 from sklearn.preprocessing import StandardScaler
+from sklearn.feature_selection import VarianceThreshold
+from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.utils import check_array
 
-# TODO: unittest?
 # TODO: sphinx documentation?
+# TODO: parallelization
+
 
 class KFold(_BaseKFold):
-    def __init__(self, n_splits: int = 5, **kwargs):
+    @overload
+    def __init__(
+        self,
+        n_splits: int = 5,
+        *,
+        metric: str = "euclidean",
+        n_jobs: Optional[int] = None,
+    ) -> None:
+        pass
+
+    def __init__(
+        self,
+        n_splits: int = 5,
+        *,
+        metric: str = "euclidean",
+        n_jobs: Optional[int] = None,
+        **kwargs,
+    ) -> None:
         """K-Folds cross-validator using the Kennard-Stone algorithm.
 
         Parameters
         ----------
         n_splits : int, optional
             Number of folds. Must be at least 2., by default 5
+
+        metric : str, optional
+            The distance metric to use. See the documentation of
+            `sklearn.metrics.pairwise_distances` for valid values.
+            , by default "euclidean"
+
+            =============== ========================================
+            metric          Function
+            =============== ========================================
+            'cityblock'     metrics.pairwise.manhattan_distances
+            'cosine'        metrics.pairwise.cosine_distances
+            'euclidean'     metrics.pairwise.euclidean_distances
+            'haversine'     metrics.pairwise.haversine_distances
+            'l1'            metrics.pairwise.manhattan_distances
+            'l2'            metrics.pairwise.euclidean_distances
+            'manhattan'     metrics.pairwise.manhattan_distances
+            'nan_euclidean' metrics.pairwise.nan_euclidean_distances
+            =============== ========================================
+
+        n_jobs : int, optional
+            The number of parallel jobs., by default None
         """
         super().__init__(n_splits=n_splits, shuffle=False, random_state=None)
+        self.metric = metric
+        self.n_jobs = n_jobs
 
         if "shuffle" in kwargs:
             warnings.warn(
@@ -47,8 +94,15 @@ def __init__(self, n_splits: int = 5, **kwargs):
             )
             del self.random_state
 
-    def _iter_test_indices(self, X=None, y=None, groups=None):
-        ks = _KennardStone(n_groups=self.get_n_splits())
+    def _iter_test_indices(
+        self, X=None, y=None, groups=None
+    ) -> Generator[List[int], None, None]:
+        ks = _KennardStone(
+            n_groups=self.get_n_splits(),
+            scale=True,
+            metric=self.metric,
+            n_jobs=self.n_jobs,
+        )
         indexes = ks.get_indexes(X)
 
         for index in indexes:
@@ -61,17 +115,24 @@ def __init__(
         n_splits: int = 1,
         *,
         test_size: Optional[Union[float, int]] = None,
-        train_size: Optional[Union[float, int]] = None
+        train_size: Optional[Union[float, int]] = None,
+        metric: str = "euclidean",
+        n_jobs: Optional[int] = None,
     ):
         super().__init__(
            n_splits=n_splits, test_size=test_size, train_size=train_size
         )
+        self.metric = metric
+        self.n_jobs = n_jobs
+
         assert self.get_n_splits() == 1, "n_splits must be 1"
         self._default_test_size = 0.1
 
     # overwrap abstractmethod
     def _iter_indices(self, X, y=None, groups=None):
-        ks = _KennardStone(n_groups=1)
+        ks = _KennardStone(
+            n_groups=1, scale=True, metric=self.metric, n_jobs=self.n_jobs
+        )
         indexes = ks.get_indexes(X)[0]
 
         n_samples = _num_samples(X)
@@ -88,27 +149,71 @@ def _iter_indices(self, X, y=None, groups=None):
         yield ind_train, ind_test
 
 
-def train_test_split(*arrays, test_size=None, train_size=None, **kwargs):
+@overload
+def train_test_split(
+    *arrays,
+    test_size: Optional[Union[float, int]] = None,
+    train_size: Optional[Union[float, int]] = None,
+    metric: str = "euclidean",
+    n_jobs: Optional[int] = None,
+) -> list:
+    pass
+
+
+def train_test_split(
+    *arrays,
+    test_size: Optional[Union[float, int]] = None,
+    train_size: Optional[Union[float, int]] = None,
+    metric: str = "euclidean",
+    n_jobs: Optional[int] = None,
+    **kwargs,
+) -> list:
     """Split arrays or matrices into train and test subsets using the
     Kennard-Stone algorithm.
 
+    The Kennard-Stone partitioning is computed from the first array passed
+    in; the remaining arrays are split with the same indexes.
+
     Parameters
    ----------
     *arrays: sequence of indexables with same length / shape[0]
         Allowed inputs are lists, numpy arrays, scipy-sparse
         matrices or pandas dataframes.
+
     test_size : float or int, optional
         If float, should be between 0.0 and 1.0 and represent the proportion
         of the dataset to include in the test split. If int, represents the
         absolute number of test samples. If None, the value is set to the
         complement of the train size. If train_size is also None, it will be
         set to 0.25., by default None
+
     train_size : float or int, optional
         If float, should be between 0.0 and 1.0 and represent the
         proportion of the dataset to include in the train split.
         If int, represents the absolute number of train samples. If None,
         the value is automatically set to the complement of the test size.,
         by default None
+
+    metric : str, optional
+        The distance metric to use. See the documentation of
+        `sklearn.metrics.pairwise_distances` for valid values.
+        , by default "euclidean"
+
+        =============== ========================================
+        metric          Function
+        =============== ========================================
+        'cityblock'     metrics.pairwise.manhattan_distances
+        'cosine'        metrics.pairwise.cosine_distances
+        'euclidean'     metrics.pairwise.euclidean_distances
+        'haversine'     metrics.pairwise.haversine_distances
+        'l1'            metrics.pairwise.manhattan_distances
+        'l2'            metrics.pairwise.euclidean_distances
+        'manhattan'     metrics.pairwise.manhattan_distances
+        'nan_euclidean' metrics.pairwise.nan_euclidean_distances
+        =============== ========================================
+
+    n_jobs : int, optional
+        The number of parallel jobs., by default None
+
     Returns
     -------
     splitting : list, length=2 * len(arrays)
@@ -118,6 +223,13 @@ def train_test_split(*arrays, test_size=None, train_size=None, **kwargs):
     ------
     ValueError
     """
+    if "shuffle" in kwargs:
+        warnings.warn(
+            "`shuffle` is unnecessary because it is always shuffled"
+            " in this algorithm.",
+            UserWarning,
+        )
+
     if "random_state" in kwargs:
         warnings.warn(
             "`random_state` is unnecessary since it is uniquely determined"
@@ -137,7 +249,9 @@ def train_test_split(*arrays, test_size=None, train_size=None, **kwargs):
     )
     CVClass = KSSplit
 
-    cv = CVClass(test_size=n_test, train_size=n_train)
+    cv = CVClass(
+        test_size=n_test, train_size=n_train, metric=metric, n_jobs=n_jobs
+    )
 
     train, test = next(cv.split(X=arrays[0]))
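`KFold` accepts the same `metric` and `n_jobs` arguments as `train_test_split` above. A short cross-validation sketch, modeled on the `__main__` block further down in this diff (the estimator and scoring choices are illustrative):

```python
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

from kennard_stone import KFold

data = load_diabetes(as_frame=True)
X, y = data.data, data.target

# Kennard-Stone folds are deterministic: no shuffle / random_state needed
# (passing them only triggers the warnings handled above).
kf = KFold(n_splits=5, n_jobs=-1)

rf = RandomForestRegressor(n_jobs=-1, random_state=334)
scores = cross_validate(rf, X, y, scoring="neg_mean_squared_error", cv=kf)
print(scores["test_score"])
```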
@@ -149,127 +263,198 @@
 
 
 class _KennardStone:
-    def __init__(self, n_groups: int = 1, scale: bool = True) -> None:
+    def __init__(
+        self,
+        n_groups: int = 1,
+        scale: bool = True,
+        metric: str = "euclidean",
+        n_jobs: Optional[int] = None,
+    ) -> None:
         """The root program of the Kennard-Stone algorithm.
 
         Parameters
         ----------
         n_groups : int, optional
             how many groups to divide, by default 1
+
         scale : bool, optional
             scaling X or not, by default True
+
+        metric : str, optional
+            The distance metric to use. See the documentation of
+            `sklearn.metrics.pairwise_distances` for valid values.
+            , by default "euclidean"
+
+            =============== ========================================
+            metric          Function
+            =============== ========================================
+            'cityblock'     metrics.pairwise.manhattan_distances
+            'cosine'        metrics.pairwise.cosine_distances
+            'euclidean'     metrics.pairwise.euclidean_distances
+            'haversine'     metrics.pairwise.haversine_distances
+            'l1'            metrics.pairwise.manhattan_distances
+            'l2'            metrics.pairwise.euclidean_distances
+            'manhattan'     metrics.pairwise.manhattan_distances
+            'nan_euclidean' metrics.pairwise.nan_euclidean_distances
+            =============== ========================================
+
+        n_jobs : int, optional
+            The number of parallel jobs., by default None
         """
         self.n_groups = n_groups
         self.scale = scale
+        self.metric = metric
+        self.n_jobs = n_jobs
 
     def get_indexes(self, X) -> List[List[int]]:
         # check input array
-        X: np.ndarray = check_array(X, ensure_2d=True)
+        X: np.ndarray = check_array(X, ensure_2d=True, dtype="numeric")
+        n_samples = X.shape[0]
+
+        # drop features with no variance
+        vselector = VarianceThreshold(threshold=0.0)
+        X = vselector.fit_transform(X)
 
         if self.scale:
             scaler = StandardScaler()
             X = scaler.fit_transform(X)
 
         # Save the original X.
-        self._original_X = X.copy()
-
-        # 全ての組成に対してそれぞれの平均との距離の二乗を配列として得る. (サンプル数の分だけ存在)
-        distance_to_ave = np.sum(np.square(X - X.mean(axis=0)), axis=1)
+        # self._original_X = X.copy()
 
-        # 最大値を取るサンプル (平均からの距離が一番遠い) のindex_numberを保存
-        idx_farthest = np.argsort(distance_to_ave)[::-1][: self.n_groups]
-
-        # 抜き出した (train用) サンプルのindex_numberを保存しとくリスト
-        lst_idx_selected: List[List[int]] = [[_idx] for _idx in idx_farthest]
-
-        # まだ抜き出しておらず,残っているサンプル (test用) サンプルのindex_numberを保存しておくリスト
-        idx_remaining = np.arange(len(X))
-
-        # 抜き出した (train用) サンプルに選ばれたサンプルをtrain用のものから削除
-        X = np.delete(X, idx_farthest, axis=0)
-        idx_remaining = np.delete(idx_remaining, idx_farthest, axis=0)
-
-        # 近い順のindexのリスト.i.e. 最初がtest向き,最後がtrain向き
-        indexes = self._sort(
-            X=X, lst_idx_selected=lst_idx_selected, idx_remaining=idx_remaining
-        )
-        assert (
-            len(list(chain.from_iterable(indexes)))
-            == len(set(chain.from_iterable(indexes)))
-            == len(self._original_X)
+        # Pre-calculate the distance matrix.
+        self.distance_matrix = pairwise_distances(
+            X, metric=self.metric, n_jobs=self.n_jobs
         )
 
-        return indexes
+        # Distance from each sample to the mean of X (one value per sample).
+        # distance_to_ave = np.sum(np.square(X - X.mean(axis=0)), axis=1)
+        distance_to_ave = pairwise_distances(
+            X,
+            X.mean(axis=0, keepdims=True),
+            metric=self.metric,
+            n_jobs=self.n_jobs,
+        ).flatten()
 
-    def _sort(
-        self,
-        X,
-        lst_idx_selected: List[List[int]],
-        idx_remaining: Union[List[int], np.ndarray],
-    ) -> List[List[int]]:
-        samples_selected: np.ndarray = self._original_X[
-            list(chain.from_iterable(lst_idx_selected))
-        ]
-
-        # まだ選択されていない各サンプルにおいて、これまで選択されたすべてのサンプルとの間で
-        # ユークリッド距離を計算し,その最小の値を「代表長さ」とする.
-
-        min_distance_to_samples_selected = np.sum(
-            np.square(
-                np.expand_dims(samples_selected, 1) - np.expand_dims(X, 0)
-            ),
-            axis=2,
-        )
+        # Keep the indexes of the samples farthest from the mean.
+        idx_farthest: List[int] = np.argsort(distance_to_ave)[::-1][
+            : self.n_groups
+        ].tolist()
+
+        distance_min = self.distance_matrix[idx_farthest, :]
+
+        # recursion limit settings
+
+        # params
+        indexes_selected = idx_farthest
+        lst_indexes_selected_prev = [[] for _ in range(self.n_groups)]
+        indexes_remaining_prev = list(range(n_samples))
+
+        for _ in range(
+            n_samples // self.n_groups + bool(n_samples % self.n_groups) - 1
+        ):
+            # collect the current indexes
+            indexes_remaining: List[int] = list()
+            arg_selected: List[int] = list()
+            for j, idx in enumerate(indexes_remaining_prev):
+                if idx in set(indexes_selected):
+                    arg_selected.append(j)
+                else:
+                    indexes_remaining.append(idx)
+            n_remaining = len(indexes_remaining)
 
-        _idxes_delete: List[int] = []
-        n_selected = len(lst_idx_selected[0])
-        for k in range(len(lst_idx_selected)):
-            if 0 < len(idx_remaining) - k:
-                _lst_sorted_args = np.argsort(
-                    np.min(
-                        min_distance_to_samples_selected[
-                            n_selected * k : n_selected * (k + 1)
-                        ],
-                        axis=0,
-                    ),
+            lst_indexes_selected = [
+                indexes_selected_prev + [index_selected]
+                for indexes_selected_prev, index_selected in zip(
+                    lst_indexes_selected_prev, indexes_selected
                 )
-                j = len(idx_remaining) - 1
-                while _lst_sorted_args[j] in set(_idxes_delete):
-                    j -= 1
-                else:
+            ]
+            # /collect the current indexes
+
+            # Determine the representative (minimum) distances.
+            distance_selected: np.ndarray = self.distance_matrix[
+                np.ix_(indexes_selected, indexes_remaining)
+            ]
+            distance_min = np.delete(distance_min, arg_selected, axis=1)
+
+            distance_min: np.ndarray = np.min(
+                np.concatenate(
+                    [
+                        distance_selected.reshape(self.n_groups, 1, -1),
+                        distance_min.reshape(self.n_groups, 1, -1),
+                    ],
+                    axis=1,
+                ),
+                axis=1,
+            )
+
+            # For each sample not yet selected, the minimum distance to all
+            # already-selected samples is its "representative distance".
+
+            _st_arg_delete: Set[int] = set()
+            indexes_selected_next: List[int] = list()
+            for k in range(self.n_groups):
+                if k == 0:
+                    arg_delete = np.argmax(
+                        distance_min[k],
+                    )
+                elif 0 < n_remaining - k:
+                    sorted_args = np.argsort(
+                        distance_min[k],
+                    )
                     # 最大値を取るサンプル (代表長さが最も大きい) のindex_numberを保存
-                    idx_selected = _lst_sorted_args[j]
+                    for j in range(n_remaining - k, -1, -1):
+                        arg_delete = sorted_args[j]
+                        if arg_delete not in _st_arg_delete:
+                            break
+                else:
+                    break
 
-                lst_idx_selected[k].append(idx_remaining[idx_selected])
-                _idxes_delete.append(idx_selected)
-            else:
-                break
+                _st_arg_delete.add(arg_delete)
+                index_selected: int = indexes_remaining[arg_delete]
 
-        # delete
-        X = np.delete(X, _idxes_delete, axis=0)
-        idx_remaining = np.delete(idx_remaining, _idxes_delete, axis=0)
+                indexes_selected_next.append(index_selected)
 
-        if len(idx_remaining):  # まだ残っているなら再帰
-            return self._sort(X, lst_idx_selected, idx_remaining)
+            indexes_selected = indexes_selected_next
+            lst_indexes_selected_prev = lst_indexes_selected
+            indexes_remaining_prev = indexes_remaining
         else:  # もうないなら遠い順から近い順 (test側) に並べ替えて終える
-            return [_idx_selected[::-1] for _idx_selected in lst_idx_selected]
+            assert n_remaining - len(indexes_selected_next) <= 0
+            indexes_output: List[List[int]] = []
+            for k in range(self.n_groups):
+                indexes_selected_reversed = lst_indexes_selected[k][::-1]
+                if k < len(indexes_selected_next):
+                    index_selected_next = indexes_selected_next[k]
+                    indexes_output.append(
+                        [index_selected_next] + indexes_selected_reversed
+                    )
+                else:
+                    indexes_output.append(indexes_selected_reversed)
+            return indexes_output
 
 
 if __name__ == "__main__":
-    import pandas as pd
     from sklearn.model_selection import cross_validate
-    from sklearn.datasets import load_diabetes
+    from sklearn.datasets import load_diabetes, fetch_california_housing
     from sklearn.ensemble import RandomForestRegressor
-    from sklearn.metrics import mean_squared_error as mse
+    from sklearn.metrics import mean_squared_error
+
+    data = fetch_california_housing(as_frame=True)
+    # data = load_diabetes(as_frame=True)
+    X = data.data
+    y = data.target
 
-    data = load_diabetes(as_frame=True)
-    X: pd.DataFrame = data.data
-    y: pd.Series = data.target
+    # ks = _KennardStone(n_groups=2, scale=True, n_jobs=-1)
+    # ks = _KennardStone(n_groups=1, scale=True, n_jobs=-1)
+    # ks.get_indexes(X)
 
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, n_jobs=-1
+    )
 
     rf = RandomForestRegressor(n_jobs=-1, random_state=334)
     rf.fit(X_train, y_train)
-    print(mse(rf.predict(X_test), y_test))
+    y_pred_on_test = rf.predict(X_test)
+    print(mean_squared_error(y_test, y_pred_on_test, squared=False))
 
-    kf = KFold(n_splits=5)
-    print(cross_validate(rf, X, y, scoring="neg_mean_squared_error", cv=kf))
+    # kf = KFold(n_splits=5, n_jobs=-1)
+    # print(cross_validate(rf, X, y, scoring="neg_mean_squared_error", cv=kf))
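The loop above replaces the old recursive `_sort` with an iterative pass over a pre-computed distance matrix, selecting for every group the remaining sample whose minimum distance to the already-selected samples is largest. A stripped-down, single-group sketch of that selection idea (not the package's exact implementation; the function name and the plain loop are illustrative):

```python
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import StandardScaler


def kennard_stone_order(X, metric="euclidean", n_jobs=None):
    """Return sample indexes ordered from most to least 'train-like'."""
    X = StandardScaler().fit_transform(np.asarray(X, dtype=float))

    # The full distance matrix is computed once; n_jobs parallelizes it.
    dist = pairwise_distances(X, metric=metric, n_jobs=n_jobs)

    # Start from the sample farthest from the data mean.
    dist_to_mean = pairwise_distances(
        X, X.mean(axis=0, keepdims=True), metric=metric, n_jobs=n_jobs
    ).ravel()
    selected = [int(np.argmax(dist_to_mean))]
    remaining = [i for i in range(X.shape[0]) if i != selected[0]]

    # Greedily pick the sample whose minimum distance to the selected set
    # (its "representative distance") is largest.
    while remaining:
        rep = dist[np.ix_(selected, remaining)].min(axis=0)
        selected.append(remaining.pop(int(np.argmax(rep))))
    return selected
```

The package's implementation additionally handles several groups at once and avoids re-indexing the whole matrix on every step, which is where the `distance_min` bookkeeping in the diff comes from.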
diff --git a/tests/test_large_data.py b/tests/test_large_data.py
new file mode 100644
index 0000000..a8aac1b
--- /dev/null
+++ b/tests/test_large_data.py
@@ -0,0 +1,18 @@
+from sklearn.datasets import fetch_california_housing
+import pytest
+
+from kennard_stone import train_test_split
+
+
+@pytest.fixture
+def prepare_data():
+    data = fetch_california_housing(as_frame=True)
+    X = data.data
+    y = data.target
+    return (X, y)
+
+
+def test_train_test_split_with_large(prepare_data):
+    X_train, X_test, y_train, y_test = train_test_split(
+        *prepare_data, test_size=0.2, n_jobs=-1
+    )
diff --git a/tests/test_odd_number_of_data.py b/tests/test_odd_number_of_data.py
new file mode 100644
index 0000000..597ce6b
--- /dev/null
+++ b/tests/test_odd_number_of_data.py
@@ -0,0 +1,13 @@
+from sklearn.datasets import load_diabetes
+
+from kennard_stone.kennard_stone import _KennardStone
+
+
+def test_odd_number_of_data():
+    diabetes = load_diabetes(as_frame=True)
+
+    X = diabetes.data
+    _KennardStone(n_groups=2).get_indexes(X)
+
+    X = X.iloc[:-1]
+    _KennardStone(n_groups=2).get_indexes(X)
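The two new tests above only check that the calls run without raising. A natural follow-up check, sketched here as a hypothetical addition (not part of this diff), is that the `KFold` test folds are disjoint and cover every sample exactly once:

```python
from itertools import chain

from sklearn.datasets import load_diabetes

from kennard_stone import KFold


def test_kfold_indexes_form_a_partition():
    X = load_diabetes(as_frame=True).data
    kf = KFold(n_splits=5, n_jobs=-1)

    # Collect the test-fold indexes from every split.
    flat = list(chain.from_iterable(test for _, test in kf.split(X)))

    # Each sample index appears exactly once across the test folds.
    assert len(flat) == len(set(flat)) == len(X)
    assert set(flat) == set(range(len(X)))
```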