pr7.py
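
"""Penguin species classification exercise (pr7).

Task 1 (t1): load and clean penguins_size.csv, integer-encode the categorical
columns, and split the data into train and test sets.
Task 2 (t2): fit a baseline RandomForestClassifier, then tune it with GridSearchCV
using macro-averaged F1.
Task 3 (t3): fit a CatBoostClassifier on CPU and report macro F1 on both splits.

The script expects 'penguins_size.csv' (presumably the Palmer penguins dataset)
in the working directory.
"""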
import time

import pandas as pd
from sklearn.metrics import f1_score


def display(_):
    # Stub so the script also runs outside a notebook; display() does nothing here.
    pass


class P7:
    # Shared dataset and train/test splits, filled in by t1() and reused by t2()/t3().
    data: pd.DataFrame
    x_train = None
    x_test = None
    y_train = None
    y_test = None
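
    # Task 1: data preparation - read the CSV, drop missing rows, factorize the
    # categorical columns (including the target), and make an 80/20 train/test split.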
    @staticmethod
    def t1():
        from sklearn.model_selection import train_test_split

        P7.data = pd.read_csv('penguins_size.csv')
        print(P7.data)
        # Drop rows with missing values before encoding.
        P7.data = P7.data.dropna()
        # Integer-encode every object (string) column, including the target 'species'.
        cat_columns = P7.data.select_dtypes(['object']).columns
        # noinspection PyShadowingNames
        P7.data[cat_columns] = P7.data[cat_columns].apply(lambda x: pd.factorize(x)[0])
        print(P7.data)
        x = P7.data.drop(['species'], axis=1)
        y = P7.data['species']
        P7.x_train, P7.x_test, P7.y_train, P7.y_test \
            = train_test_split(x, y, test_size=.2, shuffle=True, random_state=59)
        print(
            '\n',
            'Size of Predictor Train set', P7.x_train.shape, '\n',
            'Size of Predictor Test set', P7.x_test.shape, '\n',
            'Size of Target Train set', P7.y_train.shape, '\n',
            'Size of Target Test set', P7.y_test.shape
        )
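
    # Task 2: random forests - first a baseline model with fixed hyperparameters,
    # then a GridSearchCV sweep over depth/leaf/split settings scored by macro F1.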
    @staticmethod
    def t2():
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import GridSearchCV

        # Baseline random forest with hand-picked regularisation parameters.
        start_time = time.time()
        random_forest = RandomForestClassifier(max_depth=15, min_samples_split=10).fit(P7.x_train, P7.y_train)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f'Elapsed time: {elapsed_time} seconds')
        # noinspection PyUnresolvedReferences
        train_predictions = random_forest.predict(P7.x_train)
        print('F1 metric for training set', f1_score(P7.y_train, train_predictions, average='macro'))
        # noinspection PyUnresolvedReferences
        test_predictions = random_forest.predict(P7.x_test)
        print('F1 metric for test set', f1_score(P7.y_test, test_predictions, average='macro'))

        # Hyperparameter tuning of the bagging-based random forest via grid search.
        random_forest = RandomForestClassifier()
        params_grid = {
            'max_depth': [12, 18],
            'min_samples_leaf': [3, 10],
            'min_samples_split': [6, 12],
        }
        start_time = time.time()
        grid_search_random_forest = GridSearchCV(
            estimator=random_forest, param_grid=params_grid, scoring='f1_macro', cv=5
        )
        grid_search_random_forest.fit(P7.x_train, P7.y_train)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f'\nElapsed time: {elapsed_time} seconds')
        best_model = grid_search_random_forest.best_estimator_
        train_predictions = best_model.predict(P7.x_train)
        print('F1 metric for training set', f1_score(P7.y_train, train_predictions, average='macro'))
        test_predictions = best_model.predict(P7.x_test)
        print('F1 metric for test set', f1_score(P7.y_test, test_predictions, average='macro'))
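
    # Task 3: gradient boosting with CatBoost, trained on CPU and evaluated with
    # the same macro F1 metric for comparison with the random forests above.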
    @staticmethod
    def t3():
        import catboost as cb

        start_time = time.time()
        model_boost = cb.CatBoostClassifier(iterations=3000, task_type='CPU', devices='0')
        # task_type='GPU' works only on CUDA-capable GPUs - e.g. run inside Google Colab for that.
        model_boost.fit(P7.x_train, P7.y_train)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f'\nElapsed time: {elapsed_time} seconds')
        train_predictions_boosted = model_boost.predict(P7.x_train, task_type='CPU')
        print('Boosted F1 metric for train set', f1_score(P7.y_train, train_predictions_boosted, average='macro'))
        test_predictions_boosted = model_boost.predict(P7.x_test, task_type='CPU')
        print('Boosted F1 metric for test set', f1_score(P7.y_test, test_predictions_boosted, average='macro'))
if __name__ == '__main__':
    P7.t1()
    P7.t2()
    P7.t3()