-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRegression_Q2.py
128 lines (103 loc) · 5.04 KB
/
Regression_Q2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from Regress_prep_loo import *
# from Regression_preparation import *
#inspired by ex 6.2.1
# exercise 6.2.1
from matplotlib.pyplot import figure, plot, subplot, title, xlabel, ylabel, show, clim
from scipy.io import loadmat
import sklearn.linear_model as lm
from sklearn import model_selection
from toolbox_02450 import feature_selector_lr, bmplot
import numpy as np
# Data (X, y, M, attributeNames) is provided by the starred import of
# Regress_prep_loo at the top of the file.
## Crossvalidation
# Partition the observations into K shuffled folds for the outer CV loop.
K = 10
CV = model_selection.KFold(n_splits=K, shuffle=True)

# Per-fold bookkeeping:
#   Features[m, k] == 1  iff attribute m was selected in fold k.
#   Error_*              hold the mean squared errors per fold for the
#                        no-feature baseline, the full model, and the
#                        feature-selected (fs) model.
Features = np.zeros((M, K))
(Error_train, Error_test,
 Error_train_fs, Error_test_fs,
 Error_train_nofeatures, Error_test_nofeatures) = (np.empty((K, 1)) for _ in range(6))
k = 0
for train_index, test_index in CV.split(X):
    # Slice out the training and test observations for this fold.
    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]
    internal_cross_validation = 10

    # Baseline: mean squared error when predicting the training mean
    # (no input features used at all).
    Error_train_nofeatures[k] = np.square(y_train - y_train.mean()).sum() / y_train.shape[0]
    Error_test_nofeatures[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0]

    # Ordinary least squares on the full feature set (no selection).
    full_model = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
    Error_train[k] = np.square(y_train - full_model.predict(X_train)).sum() / y_train.shape[0]
    Error_test[k] = np.square(y_test - full_model.predict(X_test)).sum() / y_test.shape[0]

    # Forward feature selection driven by an inner cross-validation loop.
    selected_features, features_record, loss_record = feature_selector_lr(
        X_train, y_train, internal_cross_validation, display='')
    Features[selected_features, k] = 1
    # .. alternatively you could use module sklearn.feature_selection

    if len(selected_features) == 0:
        print('No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
    else:
        # Refit on the selected subset only and record its errors.
        fs_model = lm.LinearRegression(fit_intercept=True).fit(X_train[:, selected_features], y_train)
        Error_train_fs[k] = np.square(y_train - fs_model.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
        Error_test_fs[k] = np.square(y_test - fs_model.predict(X_test[:, selected_features])).sum() / y_test.shape[0]

        # Per-fold diagnostics: loss per selection iteration (left) and
        # which attribute entered at each iteration (right).
        figure(k)
        subplot(1, 2, 1)
        plot(range(1, len(loss_record)), loss_record[1:])
        xlabel('Iteration')
        ylabel('Squared error (crossvalidation)')

        subplot(1, 3, 3)
        bmplot(attributeNames, range(1, features_record.shape[1]), -features_record[:, 1:])
        clim(-1.5, 0)
        xlabel('Iteration')
        # plt.savefig('images/OneFold.pdf',bbox_inches = 'tight')

    print('Cross validation fold {0}/{1}'.format(k + 1, K))
    print('Train indices: {0}'.format(train_index))
    print('Test indices: {0}'.format(test_index))
    print('Features no: {0}\n'.format(selected_features.size))

    k += 1
# Display results
# The two model variants print the same four statistics, so drive both
# reports from one table. R^2 is computed against the no-feature
# baseline's total squared error.
print('\n')
for header, e_tr, e_te in (
        ('Linear regression without feature selection:\n', Error_train, Error_test),
        ('Linear regression with feature selection:\n', Error_train_fs, Error_test_fs)):
    print(header)
    print('- Training error: {0}'.format(e_tr.mean()))
    print('- Test error: {0}'.format(e_te.mean()))
    print('- R^2 train: {0}'.format((Error_train_nofeatures.sum() - e_tr.sum()) / Error_train_nofeatures.sum()))
    print('- R^2 test: {0}'.format((Error_test_nofeatures.sum() - e_te.sum()) / Error_test_nofeatures.sum()))

# Overview plot: which attributes were selected in each CV fold.
figure(k)
subplot(1, 3, 2)
bmplot(attributeNames, range(1, Features.shape[1] + 1), -Features)
clim(-1.5, 0)
xlabel('Crossvalidation fold')
ylabel('Attribute')
# plt.savefig('images/All_folds.pdf',bbox_inches = 'tight')
# Inspect selected feature coefficients effect on the entire dataset and
# plot the fitted model residual error as function of each attribute to
# inspect for systematic structure in the residual.
# Was hard-coded as f=10, silently duplicating K; use K so the script
# still inspects the last fold if the number of folds is changed.
f = K  # cross-validation fold to inspect (the last one)
ff = Features[:, f - 1].nonzero()[0]  # attribute indices selected in that fold
if len(ff) == 0:
    print('\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
else:
    # Refit on the full dataset using only that fold's selected attributes.
    m = lm.LinearRegression(fit_intercept=True).fit(X[:, ff], y)
    y_est = m.predict(X[:, ff])
    residual = y - y_est

    # One residual-vs-attribute scatter per selected attribute, laid out
    # on a 2-row grid.
    figure(k + 1, figsize=(12, 6))
    title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f))
    for i, attr_idx in enumerate(ff):
        subplot(2, int(np.ceil(len(ff) / 2)), i + 1)
        plot(X[:, attr_idx], residual, '.')
        xlabel(attributeNames[attr_idx])
        ylabel('residual error')
    # plt.savefig('images/regress_Q2_general_err.pdf',bbox_inches = 'tight')
show()