-
Notifications
You must be signed in to change notification settings - Fork 0
/
rf_prediction_model.py
70 lines (50 loc) · 1.9 KB
/
rf_prediction_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
file_name = "election_data.csv"
# Set to working directory
#os.chdir()
data = pd.read_csv(file_name, header = 0, index_col='Year')
data = data.drop(columns= "Name")
X = data.iloc[:, :9]
y = data.iloc[:, 9]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
# Set seed for reproducibility
SEED = 42
# Instantiate a random forests regressor 'rf'
rf = RandomForestRegressor(random_state= SEED)
# Inspect rf' s hyperparameters
rf.get_params()
# Define a grid of hyperparameter 'params_rf'
params_rf = {
'n_estimators': [300, 400, 500],
'max_depth': [4, 6, 8],
'min_samples_leaf': [0.1, 0.2],
'max_features': ['log2','sqrt']
}
# Instantiate 'grid_rf'
grid_rf = GridSearchCV(estimator=rf,param_grid=params_rf, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
# Searching for the best hyperparameters
# Fit 'grid_rf' to the training set
grid_rf.fit(X_train, y_train)
# Extract best hyperparameters from 'grid_rf'
best_hyperparams = grid_rf.best_params_
print('Best hyperparameters:\n', best_hyperparams)
# Extract best model from 'grid_rf'
best_model = grid_rf.best_estimator_
# Predict the test set labels
y_pred = best_model.predict(X_test)
# Evaluate the test set RMSE
rmse_test = MSE(y_test, y_pred)**(1/2)
# Print the test set RMSE
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))
x_2020 = [[52387, 8.4, 1, 1, 1, 0, -0.9, -13.2, .0783]]
y_2020_pred = best_model.predict(x_2020)
print("2020 Pred:",y_2020_pred)