Skip to content

Commit

Permalink
fix the pickle error when dumping an estimator
Browse files Browse the repository at this point in the history
  • Loading branch information
imxtx committed Sep 8, 2023
1 parent 7e62418 commit 1fd975f
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 13 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,7 @@ dmypy.json
.pyre/

#Pycharm
.idea
.idea

#VSCode
.vscode
31 changes: 19 additions & 12 deletions sklearn_genetic/genetic_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def __init__(
self.pre_dispatch = pre_dispatch
self.error_score = error_score
self.return_train_score = return_train_score
self.creator = creator
# self.creator = creator
self.log_config = log_config

# Check that the estimator is compatible with scikit-learn
Expand Down Expand Up @@ -299,8 +299,8 @@ def _register(self):
"""
self.toolbox = base.Toolbox()

self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign])
self.creator.create("Individual", list, fitness=creator.FitnessMax)
creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign])
creator.create("Individual", list, fitness=creator.FitnessMax)

attributes = []
# Assign all the parameters defined in the param_grid
Expand Down Expand Up @@ -342,7 +342,7 @@ def _register(self):
self._pop = self.toolbox.population(n=self.population_size)
self._hof = tools.HallOfFame(self.keep_top_k)

self._stats = tools.Statistics(lambda ind: ind.fitness.values)
self._stats = tools.Statistics(getter_fittness_values)
self._stats.register("fitness", np.mean)
self._stats.register("fitness_std", np.std)
self._stats.register("fitness_max", np.max)
Expand Down Expand Up @@ -540,8 +540,8 @@ def fit(self, X, y, callbacks=None):
for k in range(len(self._hof))
}

del self.creator.FitnessMax
del self.creator.Individual
del creator.FitnessMax
del creator.Individual

return self

Expand Down Expand Up @@ -878,7 +878,7 @@ def __init__(
self.pre_dispatch = pre_dispatch
self.error_score = error_score
self.return_train_score = return_train_score
self.creator = creator
# self.creator = creator
self.log_config = log_config

# Check that the estimator is compatible with scikit-learn
Expand All @@ -902,8 +902,8 @@ def _register(self):

# Criteria sign to set max or min problem
# And -1.0 as second weight to minimize number of features
self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign, -1.0])
self.creator.create("Individual", list, fitness=creator.FitnessMax)
creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign, -1.0])
creator.create("Individual", list, fitness=creator.FitnessMax)

# Register the array to choose the features
# Each binary value represents if the feature is selected or not
Expand Down Expand Up @@ -933,7 +933,7 @@ def _register(self):

# Stats among axis 0 to get two values:
# One based on the score and the other in the number of features
self._stats = tools.Statistics(lambda ind: ind.fitness.values)
self._stats = tools.Statistics(getter_fittness_values)
self._stats.register("fitness", np.mean, axis=0)
self._stats.register("fitness_std", np.std, axis=0)
self._stats.register("fitness_max", np.max, axis=0)
Expand Down Expand Up @@ -1104,8 +1104,8 @@ def fit(self, X, y, callbacks=None):

self.hof = self._hof

del self.creator.FitnessMax
del self.creator.Individual
del creator.FitnessMax
del creator.Individual

return self

Expand Down Expand Up @@ -1365,3 +1365,10 @@ def score(self, X, y):
``best_estimator_.score`` method otherwise.
"""
return self.estimator.score(self.transform(X), y)


# helpers


def getter_fittness_values(ind):
    """Return the fitness values tuple of a DEAP individual.

    Module-level named function used as the key for ``tools.Statistics``
    instead of an inline lambda: lambdas cannot be pickled, so holding one
    on the estimator broke ``joblib.dump``.  (The "fittness" spelling is
    kept because callers reference this exact name.)
    """
    fitness = ind.fitness
    return fitness.values
105 changes: 105 additions & 0 deletions sklearn_genetic/tests/test_serialization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import pytest
from sklearn.datasets import load_iris, load_diabetes
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import make_scorer
import numpy as np

from .. import GAFeatureSelectionCV
from ..callbacks import (
ThresholdStopping,
DeltaThreshold,
ConsecutiveStopping,
TimerStopping,
ProgressBar,
)
from ..schedules import ExponentialAdapter, InverseAdapter
from joblib import dump, load
import os


# Shared fixture data: the iris dataset padded with 10 uninformative noise
# columns so the feature-selection search has something to discard.
data = load_iris()
label_names = data["target_names"]
y = data["target"]
X = data["data"]

# Seed the noise generator so test data is reproducible across runs;
# train_test_split below is already pinned with random_state=42, and an
# unseeded fixture would make failures impossible to replay.
noise = np.random.default_rng(42).uniform(1, 4, size=(X.shape[0], 10))

X = np.hstack((X, noise))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


def test_estimator_serialization():
    """Round-trip a fitted GAFeatureSelectionCV through joblib.

    Verifies that the fitted estimator can be dumped and reloaded (the
    original pickle failure came from lambdas held on the object) and that
    the reloaded estimator is fully functional: fitted state, predictions,
    scores, hall of fame, logbook entries, and cv_results_ all survive.
    """
    generations = 6
    model = SGDClassifier(loss="modified_huber", fit_intercept=True)
    evolved_estimator = GAFeatureSelectionCV(
        model,
        cv=3,
        scoring="accuracy",
        population_size=6,
        generations=generations,
        tournament_size=3,
        elitism=False,
        keep_top_k=4,
        verbose=False,
        algorithm="eaSimple",
        n_jobs=-1,
        return_train_score=True,
    )
    evolved_estimator.fit(X_train, y_train)

    dump_file = "evolved_estimator.pkl"

    # Dumping must succeed now that no unpicklable objects are held.
    assert dump(evolved_estimator, dump_file)[0] == dump_file

    dumped_estimator = load(dump_file)
    features = dumped_estimator.support_

    # The reloaded object still behaves like a fitted estimator.
    assert check_is_fitted(dumped_estimator) is None
    assert features.shape[0] == X.shape[1]
    assert len(dumped_estimator) == generations + 1  # +1 random initial population
    assert len(dumped_estimator.predict(X_test)) == len(X_test)
    assert dumped_estimator.score(X_train, y_train) >= 0
    assert len(dumped_estimator.decision_function(X_test)) == len(X_test)
    assert len(dumped_estimator.predict_proba(X_test)) == len(X_test)
    assert len(dumped_estimator.predict_log_proba(X_test)) == len(X_test)
    assert dumped_estimator.score(X_test, y_test) == accuracy_score(
        y_test, dumped_estimator.predict(X_test)
    )
    assert bool(dumped_estimator.get_params())
    assert len(dumped_estimator.hof) == dumped_estimator.keep_top_k

    # Per-generation logbook entries survive the round trip.
    for logbook_key in ("gen", "fitness_max", "fitness", "fitness_std", "fitness_min"):
        assert logbook_key in dumped_estimator[0]

    cv_result_keys = set(dumped_estimator.cv_results_.keys())
    for expected_key in (
        "split0_test_score",
        "split1_test_score",
        "split2_test_score",
        "split0_train_score",
        "split1_train_score",
        "split2_train_score",
        "mean_test_score",
        "std_test_score",
        "rank_test_score",
        "mean_train_score",
        "std_train_score",
        "rank_train_score",
        "std_fit_time",
        "mean_score_time",
        "rank_n_features",
        "features",
    ):
        assert expected_key in cv_result_keys

    # Clean up the artifact so repeated runs start fresh.
    os.remove(dump_file)

0 comments on commit 1fd975f

Please sign in to comment.