Regression_Q2_V3.py
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, RidgeCV, LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (cross_validate, RepeatedKFold,
                                     ShuffleSplit, GridSearchCV)
from sklearn.metrics import mean_squared_error
from Regress_prep_loo import *  # provides X, y and attributeNames

# X = pd.DataFrame(X)
# y = pd.DataFrame(y)
# X.info()
# y.info()
# Candidate regularization strengths, log-spaced over [1e-3, 1e4]
alphas = np.logspace(-3, 4, num=1500)
cv = 10  # number of folds for the single-layer cross-validation below

# Two-layer cross-validation for choosing the regularization strength
def nested_cross_validation(X=X, y=y, inner=10, outer=10, alphas=alphas):
    """Two-layer cross-validation: select the best alpha on the inner folds
    of each outer fold, then report the error on the outer hold-out set.

    Returns the best alpha and the outer-fold test MSE for each outer fold.
    """
    model = Ridge()
    scoring = "neg_mean_squared_error"
    step = 1
    cv_outer = ShuffleSplit(n_splits=outer, test_size=0.5, random_state=0)
    cv_inner = ShuffleSplit(n_splits=inner, test_size=0.5, random_state=0)
    best_alphas = []
    test_errors = []
    for train_index, test_index in cv_outer.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        cv_scores = []
        for alpha in alphas:
            print("Working on step", step, "/", len(alphas) * outer)
            model.set_params(alpha=alpha)
            scores = cross_validate(model, X_train, y_train, scoring=scoring,
                                    cv=cv_inner, return_train_score=False)
            cv_scores.append(scores["test_score"])
            step += 1
        cv_scores = np.array(cv_scores)  # shape (n_alphas, inner)
        # Average over the inner folds (axis=1, not axis=0), then pick the
        # alpha with the highest mean negative MSE.
        mean_scores = np.mean(cv_scores, axis=1)
        best_alpha = alphas[np.argmax(mean_scores)]
        best_alphas.append(best_alpha)
        # Refit with the selected alpha on the full outer training set and
        # evaluate on the held-out outer test set.
        model.set_params(alpha=best_alpha)
        model.fit(X_train, y_train)
        test_errors.append(mean_squared_error(y_test, model.predict(X_test)))
    return best_alphas, test_errors
best_alphas, test_errors = nested_cross_validation()
print("Best alpha per outer fold:", best_alphas)
print("Estimated generalization error (MSE):", np.mean(test_errors))
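# A much faster equivalent of the inner selection loop is RidgeCV, which
# scores every alpha in a single pass per outer fold. This is an
# illustrative sketch, not part of the original analysis; it assumes the
# same X, y and alphas as above.
# for train_index, test_index in ShuffleSplit(n_splits=10, test_size=0.5,
#                                             random_state=0).split(X):
#     ridge = RidgeCV(alphas=alphas, scoring="neg_mean_squared_error")
#     ridge.fit(X[train_index], y[train_index])
#     print(ridge.alpha_,
#           mean_squared_error(y[test_index], ridge.predict(X[test_index])))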
# def cross_validation(X=X, y=y, alphas=alphas, cv=cv):
# """estimate generalization error"""
# n_alphas = len(alphas)
# error_train = np.zeros(n_alphas)
# error_test = np.zeros(n_alphas)
# for i, alpha in enumerate(alphas):
# model = Ridge(alpha=alpha)
# cv_error = cross_validate(model, X, y, cv=cv, scoring="neg_mean_squared_error", return_train_score=True)
# error_train[i] = np.mean(cv_error["train_score"])
# error_test[i] = np.mean(cv_error["test_score"])
# return error_train, error_test
# error_train, error_test = cross_validation()
# print(-error_test, -error_train)
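# The commented loop above can also be expressed with sklearn's
# validation_curve, which returns train/test scores per alpha directly.
# A sketch under the same X, y, alphas and cv (not part of the original
# script):
# from sklearn.model_selection import validation_curve
# train_scores, test_scores = validation_curve(
#     Ridge(), X, y, param_name="alpha", param_range=alphas,
#     cv=cv, scoring="neg_mean_squared_error")
# error_train = train_scores.mean(axis=1)
# error_test = test_scores.mean(axis=1)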
# def plot_error(error_test, alphas):
# """plot generalization error"""
# plt.plot(alphas, error_test)
# plt.xlabel('alpha')
# plt.ylabel('error')
# plt.xscale('log')
# plt.show()
# plot_error( error_test, alphas)
# w = list()
# for a in alphas:
# ridge_clf = RidgeCV(alphas=[a],cv=10).fit(X, y)
# w.append(ridge_clf.coef_)
# w = np.array(w)
# plt.semilogx(alphas, w)
# plt.title('Ridge coefficients as function of the regularization')
# plt.xlabel('Regularization factor')
# plt.ylabel('Mean Coefficient Values')
# plt.legend(attributeNames, prop={'size': 6})
# plt.savefig('images/regress_Q2_weigths.pdf',bbox_inches = 'tight')
# param = {
#     'alpha': np.logspace(-1, 5, num=150),
#     'fit_intercept': [True, False],
#     'normalize': [True, False],  # deprecated in scikit-learn 1.0, removed in 1.2
#     'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
# }
# # define model
# model = Ridge()
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# # define search
# search = GridSearchCV(model, param, scoring='r2', n_jobs=-1, cv=cv)
# # execute search
# result = search.fit(X, y)
# # summarize result
# print('Best Score: %s' % result.best_score_)
# print('Best Hyperparameters: %s' % result.best_params_)
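# Since `normalize` is gone from recent scikit-learn, the modern equivalent
# is to standardize inside a pipeline and tune the step-prefixed parameter.
# A sketch of that replacement (not part of the original run; step name
# 'ridge' is what make_pipeline assigns automatically):
# pipe = make_pipeline(StandardScaler(), Ridge())
# param = {'ridge__alpha': np.logspace(-1, 5, num=150)}
# search = GridSearchCV(pipe, param, scoring='r2', n_jobs=-1,
#                       cv=RepeatedKFold(n_splits=10, n_repeats=3,
#                                        random_state=1))
# result = search.fit(X, y)
# print('Best Score: %s' % result.best_score_)
# print('Best Hyperparameters: %s' % result.best_params_)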