GridSearchCV_norefit.py

from collections import defaultdict
from itertools import product

import numpy as np
from joblib import Parallel, delayed

from sklearn.base import clone, is_classifier
from sklearn.metrics import check_scoring
from sklearn.metrics._scorer import _check_multimetric_scoring
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection._split import check_cv
# Note: the underscore-prefixed helpers below are scikit-learn internals and
# may move or change between scikit-learn versions.
from sklearn.model_selection._validation import (
    _fit_and_score,
    _insert_error_scores,
    _warn_or_raise_about_fit_failures,
)
from sklearn.utils.validation import indexable, _check_fit_params

class GridSearchCV_norefit(GridSearchCV):
    """GridSearchCV that skips the final refit on the whole training set.

    Instead, ``best_estimator_`` is the estimator fitted during cross
    validation on the single best-scoring (candidate, fold) combination.
    Intended for single-metric scoring, since the best fold is picked by
    ``np.argmax`` over the per-fold test scores.
    """

    def __init__(self, estimator, param_grid, scoring=None,
                 n_jobs=None, refit=False, cv=None,
                 verbose=0, pre_dispatch='2*n_jobs',
                 error_score=np.nan, return_train_score=False):
        # Force refit=False in the parent class regardless of the value passed.
        super().__init__(estimator, param_grid, scoring=scoring,
                         n_jobs=n_jobs, refit=False, cv=cv,
                         verbose=verbose, pre_dispatch=pre_dispatch,
                         error_score=error_score,
                         return_train_score=return_train_score)

    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        y : array-like of shape (n_samples, n_output) or (n_samples,), optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like of shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test sets. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).

        **fit_params : dict of str -> object
            Parameters passed to the ``fit`` method of the estimator.
        """
        estimator = self.estimator
        refit_metric = "score"

        # Resolve the scorer(s) the same way BaseSearchCV.fit does.
        if callable(self.scoring):
            scorers = self.scoring
        elif self.scoring is None or isinstance(self.scoring, str):
            scorers = check_scoring(self.estimator, self.scoring)
        else:
            scorers = _check_multimetric_scoring(self.estimator, self.scoring)
            self._check_refit_for_multimetric(scorers)
            refit_metric = self.refit

        X, y, groups = indexable(X, y, groups)
        fit_params = _check_fit_params(X, fit_params)

        cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
        n_splits = cv_orig.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=False,
                                    error_score=self.error_score,
                                    verbose=self.verbose,
                                    return_estimator=True)  # added: keep each fold's fitted estimator

        results = {}
        best_estimator = None
        with parallel:
            all_candidate_params = []
            all_out = []
            all_more_results = defaultdict(list)

            def evaluate_candidates(candidate_params, cv=None, more_results=None):
                cv = cv or cv_orig
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                if self.verbose > 0:
                    print(
                        "Fitting {0} folds for each of {1} candidates,"
                        " totalling {2} fits".format(
                            n_splits, n_candidates, n_candidates * n_splits
                        )
                    )

                out = parallel(
                    delayed(_fit_and_score)(
                        clone(base_estimator),
                        X,
                        y,
                        train=train,
                        test=test,
                        parameters=parameters,
                        split_progress=(split_idx, n_splits),
                        candidate_progress=(cand_idx, n_candidates),
                        **fit_and_score_kwargs,
                    )
                    for (cand_idx, parameters), (split_idx, (train, test)) in product(
                        enumerate(candidate_params), enumerate(cv.split(X, y, groups))
                    )
                )

                if len(out) < 1:
                    raise ValueError(
                        "No fits were performed. "
                        "Was the CV iterator empty? "
                        "Were there no candidates?"
                    )
                elif len(out) != n_candidates * n_splits:
                    raise ValueError(
                        "cv.split and cv.get_n_splits returned "
                        "inconsistent results. Expected {} "
                        "splits, got {}".format(n_splits, len(out) // n_candidates)
                    )

                _warn_or_raise_about_fit_failures(out, self.error_score)

                # For callable self.scoring, the return type is only known
                # after calling. If the return type is a dictionary, the error
                # scores can now be inserted with the correct key. The type
                # checking of out is done in `_insert_error_scores`.
                if callable(self.scoring):
                    _insert_error_scores(out, self.error_score)

                # Instead of refitting, keep the estimator from the single
                # best-scoring (candidate, fold) fit.
                best_index = np.argmax([s['test_scores'] for s in out])
                nonlocal best_estimator
                best_estimator = out[best_index]['estimator']
                for o in out:  # free up memory
                    del o['estimator']

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)

                if more_results is not None:
                    for key, value in more_results.items():
                        all_more_results[key].extend(value)

                nonlocal results
                results = self._format_results(
                    all_candidate_params, n_splits, all_out, all_more_results
                )
                return results

            self._run_search(evaluate_candidates)

        self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
        self.best_score_ = results["mean_test_%s" % refit_metric][self.best_index_]
        self.best_params_ = results["params"][self.best_index_]
        self.best_estimator_ = best_estimator

        # Store the only scorer not as a dict for single-metric evaluation.
        self.scorer_ = scorers

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self

    def predict(self, X):
        # Delegate to the estimator kept from the best-scoring CV fold.
        return self.best_estimator_.predict(X)

    def predict_proba(self, X):
        return self.best_estimator_.predict_proba(X)

    def transform(self, X):
        return self.best_estimator_.transform(X)
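

# A minimal usage sketch (not part of the original file): the dataset and
# estimator below are illustrative assumptions. Note that best_estimator_ is
# the model fitted on the best-scoring CV fold, so it has only seen that
# fold's training portion of the data, never the full training set.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    search = GridSearchCV_norefit(
        LogisticRegression(max_iter=1000),
        param_grid={"C": [0.1, 1.0, 10.0]},
        cv=5,
    )
    search.fit(X, y)
    print(search.best_params_, search.best_score_)
    print(search.predict(X[:5]))  # delegates to the best fold's estimator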