#Regression with Scikit-learn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
data = pd.read_csv('boston.csv')
#view data
print(data.head())
#target variable is MEDV; return numpy arrays
y = data['MEDV'].values
X = data.drop('MEDV', axis=1).values
#regress on the number of rooms (the column at index 5)
NumberRooms = X[:, 5]
#check what type each is
print(type(NumberRooms), type(y))
#both are numpy arrays
#reshape them into column vectors for scikit-learn
NumberRooms = NumberRooms.reshape(-1, 1)
y = y.reshape(-1, 1)
#let's plot
plt.scatter(NumberRooms, y)
plt.ylabel("Value of house ($1000s)")
plt.xlabel("Number of rooms")
plt.show()
#fit model
model=linear_model.LinearRegression()
model.fit(NumberRooms,y)
#plot the fitted regression line over the data
XAxis = np.linspace(NumberRooms.min(), NumberRooms.max()).reshape(-1, 1)
plt.scatter(NumberRooms, y, color='blue')
plt.plot(XAxis, model.predict(XAxis), color='black', linewidth=3)
plt.show()
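#A quick sketch to inspect the fitted line itself (uses the model fitted above;
#coef_ and intercept_ are standard scikit-learn attributes, indexed for the column-vector y):
slope = model.coef_[0][0]
intercept = model.intercept_[0]
print("Fitted line: MEDV = {:.2f} * rooms + {:.2f}".format(slope, intercept))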
#---------------------------------------------------------
#More practice with fertility rates and life expectancy
# Import numpy and pandas
import numpy as np
import pandas as pd
# Read the CSV file into a DataFrame: df
df = pd.read_csv("gapminder.csv")
# Create arrays for features and target variable
y = df['life'].values
X = df['fertility'].values
# Print the dimensions of X and y before reshaping
print("Dimensions of y before reshaping: {}".format(y.shape))
print("Dimensions of X before reshaping: {}".format(X.shape))
# Reshape X and y
y = y.reshape(-1,1)
X = X.reshape(-1,1)
# Print the dimensions of X and y after reshaping
print("Dimensions of y after reshaping: {}".format(y.shape))
print("Dimensions of X after reshaping: {}".format(X.shape))
#make a correlation heatmap
#the heatmap is generated with Seaborn's heatmap function; df.corr() computes the pairwise correlation between columns
import seaborn as sns
sns.heatmap(df.corr(), square=True, cmap='RdYlGn')
#explore the DataFrame using pandas methods such as .info(), .describe(), .head().
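#A minimal sketch of that exploration (standard pandas calls on the df loaded above):
df.info()              # column names, dtypes, and non-null counts
print(df.describe())   # summary statistics for the numeric columns
print(df.head())       # first five rows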
#---------------------------------------------------------
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
#find out how good the model is with R^2
print(model.score(X_test, y_test))
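#A minimal sketch of what .score() returns (uses y_test and y_pred from above):
#R^2 = 1 - SS_res / SS_tot
ss_res = np.sum((y_test - y_pred) ** 2)
ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)
print("R^2 computed by hand: {:.4f}".format(1 - ss_res / ss_tot))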
#---------------------------------------------------------
# more regression
# Import LinearRegression
from sklearn.linear_model import LinearRegression
# Create the regressor: reg
reg = LinearRegression()
# Create the prediction space
prediction_space = np.linspace(min(X_fertility), max(X_fertility)).reshape(-1,1)
# Fit the model to the data
reg.fit(X_fertility,y)
# Compute predictions over the prediction space: y_pred
y_pred = reg.predict(prediction_space)
# Print R^2
print(reg.score(X_fertility,y))
# Plot regression line
plt.plot(prediction_space, y_pred, color='black', linewidth=3)
plt.show()
#---------------------------------------------------------
# Import necessary modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Create training and test sets so that 70% train, 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)
# Create the regressor: reg_all
reg_all = LinearRegression()
# Fit the regressor to the training data
reg_all.fit(X_train,y_train)
# Predict on the test data: y_pred
y_pred = reg_all.predict(X_test)
# Compute and print R^2 and RMSE
print("R^2: {}".format(reg_all.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: {}".format(rmse))
#---------------------------------------------------------
#Cross validation
#Motivation: model performance depends on how the data is split, so a single split may not be representative of the model's ability to generalize to unseen data
#begin by splitting the data into 5 parts, or folds
#use the first fold as the test sample and the other 4 for training
#do this for each fold
#this gives you 5 goodness-of-fit metrics to compare
#this is called 5-fold cross validation; it could be 10-fold, or k-fold in general, but more folds is more computationally expensive
from sklearn.model_selection import cross_val_score
model = linear_model.LinearRegression()
CV_RSquare = cross_val_score(model, X, y, cv=5)
print(CV_RSquare)
print(np.mean(CV_RSquare))
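#A sketch of the fold-by-fold procedure described above, written out with KFold
#so each split is explicit (cross_val_score does the same thing internally):
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
fold_scores = []
for train_idx, test_idx in kf.split(X):
    fold_model = linear_model.LinearRegression()
    fold_model.fit(X[train_idx], y[train_idx])
    fold_scores.append(fold_model.score(X[test_idx], y[test_idx]))
print(fold_scores)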
#---------------------------------------------------------
# Import the necessary modules
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
# Create a linear regression object: reg
reg = LinearRegression()
# Compute 5-fold cross-validation scores: cv_scores
cv_scores = cross_val_score(reg,X,y,cv=5)
# Print the 5-fold cross-validation scores
print(cv_scores)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores)))
#---------------------------------------------------------
#In the IPython shell, you can use %timeit to see how long 3-fold CV takes compared to 10-fold CV by running cross_val_score with cv=3 and cv=10:
# Import necessary modules
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
# Create a linear regression object: reg
reg = LinearRegression()
# Perform 3-fold CV
cvscores_3 = cross_val_score(reg,X,y,cv=3)
#in the shell, do %timeit cross_val_score(reg,X,y,cv=3)
print(np.mean(cvscores_3))
# Perform 10-fold CV
cvscores_10 = cross_val_score(reg,X,y,cv=10)
print(np.mean(cvscores_10))
#---------------------------------------------------------
#Regularization
#large coefficients can lead to overfitting, so penalize large coefficients
#ridge loss = sum of squared residuals + some constant alpha times the sum of the squared coefficients
#thus, large coefficients = large penalty
#we choose alpha, much like choosing k in kNN for classification; this is called hyperparameter tuning
#alpha (sometimes called lambda) controls model complexity
#when alpha=0, we get back to standard OLS (which can lead to overfitting)
#as alpha approaches infinity, large coefficients are heavily penalized, which produces too simple a model and leads to underfitting
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
ridge = Ridge(alpha=0.1, normalize=True)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(ridge.score(X_test, y_test))
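#A minimal sketch of how alpha controls complexity (uses X_train, y_train from
#the split above): larger alpha shrinks the coefficients toward zero.
for a in [0.01, 0.1, 1, 10, 100]:
    r = Ridge(alpha=a)
    r.fit(X_train, y_train)
    print("alpha={}: sum of squared coefficients = {:.4f}".format(a, np.sum(r.coef_ ** 2)))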
#Lasso can be used to select the most important features of a dataset
#it tends to shrink the coefficients of less important features to 0
from sklearn.linear_model import Lasso
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
lasso = Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(lasso.score(X_test, y_test))
#Lasso for feature selection, i.e. independent variable selection
from sklearn.linear_model import Lasso
columns = data.drop('MEDV', axis=1).columns
lasso = Lasso(alpha=0.1)
lasso_coeff = lasso.fit(X, y).coef_
#plot the coefficients as a function of feature name
_ = plt.plot(range(len(columns)), lasso_coeff)
_ = plt.xticks(range(len(columns)), columns, rotation=60)
_ = plt.ylabel('Coefficients')
plt.show()
#the most important predictor is the number of rooms
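#A small sketch of lasso as a feature selector (uses columns and lasso_coeff
#from above): list the features whose coefficients were not shrunk to zero.
selected = [name for name, coef in zip(columns, lasso_coeff) if coef != 0]
print("Features kept by the lasso:", selected)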
# Import Lasso
from sklearn.linear_model import Lasso
# Instantiate a lasso regressor: lasso
lasso = Lasso(alpha=0.4, normalize=True)
# Fit the regressor to the data
lasso.fit(X, y)
# Compute and print the coefficients
lasso_coef = lasso.coef_
print(lasso_coef)
# Plot the coefficients
plt.plot(range(len(df_columns)), lasso_coef)
plt.xticks(range(len(df_columns)), df_columns.values, rotation=60)
plt.margins(0.02)
plt.show()
#-------------------------------------
#Lasso is great for feature selection, but when building regression models, Ridge regression should be your first choice.
#Recall that lasso performs regularization by adding to the loss function a penalty term: the absolute value of each coefficient multiplied by some alpha.
#This is also known as L1 regularization because the regularization term is the L1 norm of the coefficients. This is not the only way to regularize, however.
#If instead you took the sum of the squared values of the coefficients multiplied by some alpha - like in ridge regression - you would be computing the L2 norm.
#In this exercise, you will practice fitting ridge regression models over a range of different alphas and plot cross-validated R^2 scores for each,
#using this function that we have defined for you, which plots the R^2 score as well as the standard error for each alpha:
def display_plot(cv_scores, cv_scores_std):
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alpha_space, cv_scores)
    std_error = cv_scores_std / np.sqrt(10)
    ax.fill_between(alpha_space, cv_scores + std_error, cv_scores - std_error, alpha=0.2)
    ax.set_ylabel('CV Score +/- Std Error')
    ax.set_xlabel('Alpha')
    ax.axhline(np.max(cv_scores), linestyle='--', color='.5')
    ax.set_xlim([alpha_space[0], alpha_space[-1]])
    ax.set_xscale('log')
    plt.show()
# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []
# Create a ridge regressor: ridge
#this is an object
ridge = Ridge(normalize=True)
# Compute scores over range of alphas
for alpha in alpha_space:
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha
    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)
    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))
    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))
# Display the plot
display_plot(ridge_scores, ridge_scores_std)