"""
@author: Daniel Moreira de Sousa
@my github: https://github.com/DanielMSousa
"""
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


class model_evaluator:
    def __init__(self):
        # You can add more models here
        self.models = [
            # use this structure: (name, model())
            ('Linear regression', LinearRegression()),
            ('Multiple linear regression', LinearRegression()),
            ('SVR', SVR()),
            ('Decision Tree', DecisionTreeRegressor(random_state=42)),
            ('Random Forest', RandomForestRegressor(random_state=42)),
            ('KNN regressor', KNeighborsRegressor()),
        ]
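        # A minimal sketch of registering one more estimator, assuming you also add
        # "from sklearn.ensemble import GradientBoostingRegressor" to the imports
        # (this model is an illustration, not part of the original list):
        # self.models.append(('Gradient Boosting', GradientBoostingRegressor(random_state=42)))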
        self.cv_models = []
        self.predictions = []
        self.metrics = []
        # You can change the grid search params here;
        # make sure the keys use the same names as the entries in self.models
        self.grid_searchs = {
            'SVR': {
                'C': [0.1, 1, 5, 10, 100, 1000],
                'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                'kernel': ['rbf'],
                'degree': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
            },
            'Random Forest': {
                'n_estimators': [100, 150],
                'random_state': [42]
            },
            'KNN regressor': {
                'n_neighbors': [3, 4, 5, 8, 10],
            }
        }
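        # A hedged sketch of one more grid that could be added above, e.g. for the
        # 'Decision Tree' entry; the parameter names are standard
        # DecisionTreeRegressor arguments, the chosen values are illustrative only:
        #     'Decision Tree': {
        #         'max_depth': [None, 5, 10, 20],
        #         'min_samples_split': [2, 5, 10],
        #     },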

    def generate_metrics(self, name, model, X_test, y_test):
        self.cv_models.append((name, model))
        p = model.predict(X_test)
        # you can add more evaluation metrics to this dict;
        # y_test holds the true values and p the predicted values
        d = {
            'Name': name,
            'r2 score': r2_score(y_test, p),
            'MSE': mean_squared_error(y_test, p),
            'RMSE': mean_squared_error(y_test, p) ** (1 / 2),
            'MAE': mean_absolute_error(y_test, p),
        }
        self.metrics.append(d)
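    # A hedged sketch of an extra entry for the dict above, assuming scikit-learn
    # >= 0.24 and one more import ("from sklearn.metrics import
    # mean_absolute_percentage_error"); the key name 'MAPE' is illustrative:
    #     'MAPE': mean_absolute_percentage_error(y_test, p),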

    def evaluate_models(self, X_train, X_test, y_train, y_test, cv=None, verb=0):
        for name, model in self.models:
            # This branch handles the simple linear regression,
            # using only the column most correlated with the target
            if name == 'Linear regression':
                y_train.columns = ['target']
                fd = pd.concat([X_train, y_train], axis=1)
                corr = fd.corr()['target'].drop('target')
                a = np.absolute(corr.values).max()
                ind = corr[(corr == a) | (corr == -a)].index[0]
                lr_train = X_train[ind]
                # fit the registered instance in place so select_best_r2 can return a fitted model
                model.fit(lr_train.values.reshape(-1, 1), y_train)
                self.generate_metrics(name, model, X_test[ind].values.reshape(-1, 1), y_test)
                continue
            # this is where the grid search cross-validation happens
            if cv and name in self.grid_searchs:
                print(f'Cross validating {name}, it may take some time!')
                m = GridSearchCV(model, param_grid=self.grid_searchs[name], refit=True, verbose=verb)
                m.fit(X_train, y_train.values.ravel())
                self.generate_metrics(name + '_cv', m, X_test, y_test)
            model.fit(X_train, y_train.values.ravel())
            self.generate_metrics(name, model, X_test, y_test)
        self.models.extend(self.cv_models)
        self.metrics = pd.DataFrame(self.metrics)
        self.metrics = self.metrics.set_index('Name').sort_values(by=['r2 score'], ascending=False)
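    # A hedged alternative sketch: instead of a single hold-out split, each model
    # could also be scored with k-fold cross-validation via the standard
    # sklearn.model_selection.cross_val_score helper (not part of the original class):
    #     from sklearn.model_selection import cross_val_score
    #     scores = cross_val_score(model, X_train, y_train.values.ravel(), scoring='r2', cv=5)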

    # This method returns the model that had the best R2 score in the evaluation
    def select_best_r2(self):
        best = self.metrics[self.metrics['r2 score'] == self.metrics['r2 score'].max()]
        print(best)
        for model in self.models:
            # best.index is a pandas Index, so membership is the safe comparison
            if model[0] in best.index:
                return model[1]


# In this example we'll be using the Boston housing dataset
# (note: load_boston was deprecated and then removed in scikit-learn 1.2)
from sklearn.datasets import load_boston

data = load_boston()
X = pd.DataFrame(data['data'], columns=data.feature_names)
y = pd.DataFrame(data['target'], columns=['target'])
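# A hedged drop-in sketch for newer scikit-learn releases where load_boston is
# gone: the California housing data exposes the same Bunch fields used above
# (fetch_california_housing is standard sklearn API; swapping it in is my
# substitution, not the author's choice):
#     from sklearn.datasets import fetch_california_housing
#     data = fetch_california_housing()
#     X = pd.DataFrame(data['data'], columns=data.feature_names)
#     y = pd.DataFrame(data['target'], columns=['target'])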
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
# fit the scaler on the training data only, then reuse it for the test data;
# the results must be assigned back, otherwise the scaling has no effect
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
#Without grid search
tester = model_evaluator()
tester.evaluate_models(X_train, X_test, y_train, y_test)
print(tester.metrics)
print()
model = tester.select_best_r2()
print(model)
print()
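# A hedged sketch of using the selected model for prediction, assuming the winner
# was fitted on the full feature matrix (the simple 'Linear regression' entry is
# trained on a single column, so it would need only that column instead);
# "preds" is an illustrative name, not part of the original script:
#     preds = model.predict(X_test)
#     print(preds[:5])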
#With grid search
tester2 = model_evaluator()
# only use this if you have plenty of processing power and RAM!
tester2.evaluate_models(X_train, X_test, y_train, y_test, cv=True)
print(tester2.metrics)
print()
model2 = tester2.select_best_r2()
print(model2)
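# If the winning entry came from the grid search, the returned object is the fitted
# GridSearchCV wrapper, so its tuned settings can be inspected (hedged sketch):
#     if hasattr(model2, 'best_params_'):
#         print(model2.best_params_)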