diff --git a/.gitignore b/.gitignore index e05d31b..88db6e7 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,7 @@ dmypy.json .pyre/ #Pycharm -.idea \ No newline at end of file +.idea + +#VSCode +.vscode \ No newline at end of file diff --git a/sklearn_genetic/genetic_search.py b/sklearn_genetic/genetic_search.py index 2ea6e14..28a855b 100644 --- a/sklearn_genetic/genetic_search.py +++ b/sklearn_genetic/genetic_search.py @@ -257,7 +257,7 @@ def __init__( self.pre_dispatch = pre_dispatch self.error_score = error_score self.return_train_score = return_train_score - self.creator = creator + # self.creator = creator self.log_config = log_config # Check that the estimator is compatible with scikit-learn @@ -299,8 +299,8 @@ def _register(self): """ self.toolbox = base.Toolbox() - self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign]) - self.creator.create("Individual", list, fitness=creator.FitnessMax) + creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign]) + creator.create("Individual", list, fitness=creator.FitnessMax) attributes = [] # Assign all the parameters defined in the param_grid @@ -342,7 +342,7 @@ def _register(self): self._pop = self.toolbox.population(n=self.population_size) self._hof = tools.HallOfFame(self.keep_top_k) - self._stats = tools.Statistics(lambda ind: ind.fitness.values) + self._stats = tools.Statistics(getter_fitness_values) self._stats.register("fitness", np.mean) self._stats.register("fitness_std", np.std) self._stats.register("fitness_max", np.max) @@ -540,8 +540,8 @@ def fit(self, X, y, callbacks=None): for k in range(len(self._hof)) } - del self.creator.FitnessMax - del self.creator.Individual + del creator.FitnessMax + del creator.Individual return self @@ -878,7 +878,7 @@ def __init__( self.pre_dispatch = pre_dispatch self.error_score = error_score self.return_train_score = return_train_score - self.creator = creator + # self.creator = creator self.log_config = log_config # Check that the 
estimator is compatible with scikit-learn @@ -902,8 +902,8 @@ def _register(self): # Criteria sign to set max or min problem # And -1.0 as second weight to minimize number of features - self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign, -1.0]) - self.creator.create("Individual", list, fitness=creator.FitnessMax) + creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign, -1.0]) + creator.create("Individual", list, fitness=creator.FitnessMax) # Register the array to choose the features # Each binary value represents if the feature is selected or not @@ -933,7 +933,7 @@ def _register(self): # Stats among axis 0 to get two values: # One based on the score and the other in the number of features - self._stats = tools.Statistics(lambda ind: ind.fitness.values) + self._stats = tools.Statistics(getter_fitness_values) self._stats.register("fitness", np.mean, axis=0) self._stats.register("fitness_std", np.std, axis=0) self._stats.register("fitness_max", np.max, axis=0) @@ -1104,8 +1104,8 @@ def fit(self, X, y, callbacks=None): self.hof = self._hof - del self.creator.FitnessMax - del self.creator.Individual + del creator.FitnessMax + del creator.Individual return self @@ -1365,3 +1365,10 @@ def score(self, X, y): ``best_estimator_.score`` method otherwise. 
""" return self.estimator.score(self.transform(X), y) + + +# helpers + + +def getter_fitness_values(ind): + return ind.fitness.values diff --git a/sklearn_genetic/tests/test_serialization.py b/sklearn_genetic/tests/test_serialization.py new file mode 100644 index 0000000..c2b9fad --- /dev/null +++ b/sklearn_genetic/tests/test_serialization.py @@ -0,0 +1,105 @@ +import pytest +from sklearn.datasets import load_iris, load_diabetes +from sklearn.linear_model import SGDClassifier +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.model_selection import train_test_split +from sklearn.utils.validation import check_is_fitted +from sklearn.cluster import KMeans +from sklearn.metrics import accuracy_score, balanced_accuracy_score +from sklearn.metrics import make_scorer +import numpy as np + +from .. import GAFeatureSelectionCV +from ..callbacks import ( + ThresholdStopping, + DeltaThreshold, + ConsecutiveStopping, + TimerStopping, + ProgressBar, +) +from ..schedules import ExponentialAdapter, InverseAdapter +from joblib import dump, load +import os + + +data = load_iris() +label_names = data["target_names"] +y = data["target"] +X = data["data"] + +noise = np.random.uniform(1, 4, size=(X.shape[0], 10)) + +X = np.hstack((X, noise)) + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) + + +def test_estimator_serialization(): + clf = SGDClassifier(loss="modified_huber", fit_intercept=True) + generations = 6 + evolved_estimator = GAFeatureSelectionCV( + clf, + cv=3, + scoring="accuracy", + population_size=6, + generations=generations, + tournament_size=3, + elitism=False, + keep_top_k=4, + verbose=False, + algorithm="eaSimple", + n_jobs=-1, + return_train_score=True, + ) + + evolved_estimator.fit(X_train, y_train) + dump_file = "evolved_estimator.pkl" + + # test dump + assert dump(evolved_estimator, dump_file)[0] == dump_file + + # load + dumped_estimator = load(dump_file) + features = 
dumped_estimator.support_ + + assert check_is_fitted(dumped_estimator) is None + assert features.shape[0] == X.shape[1] + assert len(dumped_estimator) == generations + 1 # +1 random initial population + assert len(dumped_estimator.predict(X_test)) == len(X_test) + assert dumped_estimator.score(X_train, y_train) >= 0 + assert len(dumped_estimator.decision_function(X_test)) == len(X_test) + assert len(dumped_estimator.predict_proba(X_test)) == len(X_test) + assert len(dumped_estimator.predict_log_proba(X_test)) == len(X_test) + assert dumped_estimator.score(X_test, y_test) == accuracy_score( + y_test, dumped_estimator.predict(X_test) + ) + assert bool(dumped_estimator.get_params()) + assert len(dumped_estimator.hof) == dumped_estimator.keep_top_k + assert "gen" in dumped_estimator[0] + assert "fitness_max" in dumped_estimator[0] + assert "fitness" in dumped_estimator[0] + assert "fitness_std" in dumped_estimator[0] + assert "fitness_min" in dumped_estimator[0] + + cv_results_ = dumped_estimator.cv_results_ + cv_result_keys = set(cv_results_.keys()) + + assert "split0_test_score" in cv_result_keys + assert "split1_test_score" in cv_result_keys + assert "split2_test_score" in cv_result_keys + assert "split0_train_score" in cv_result_keys + assert "split1_train_score" in cv_result_keys + assert "split2_train_score" in cv_result_keys + assert "mean_test_score" in cv_result_keys + assert "std_test_score" in cv_result_keys + assert "rank_test_score" in cv_result_keys + assert "mean_train_score" in cv_result_keys + assert "std_train_score" in cv_result_keys + assert "rank_train_score" in cv_result_keys + assert "std_fit_time" in cv_result_keys + assert "mean_score_time" in cv_result_keys + assert "rank_n_features" in cv_result_keys + assert "features" in cv_result_keys + + # delete dumped estimator + os.remove(dump_file)