-
Notifications
You must be signed in to change notification settings - Fork 0
/
AdaBoostClassifier.py
90 lines (73 loc) · 3.08 KB
/
AdaBoostClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import numpy as np
import random
from sklearn.tree import DecisionTreeClassifier
class AdaBoost:
    """AdaBoost classifier built from depth-1 decision-tree stumps.

    Targets are expected in {-1, +1}: `predict` returns the sign of the
    alpha-weighted vote of all stumps.  Relies on the module-level
    helpers `compute_error`, `compute_alpha` and `update_weights`.
    """

    def __init__(self, alpha_type=0, duplicate=False):
        # alpha_type: 0 -> classic 0.5*log((1-err)/err) stump weight,
        #             1 -> stump weight directly proportional to its error.
        # duplicate: if True, randomly duplicate misclassified rows
        #            between boosting rounds (experimental oversampling).
        self.alphas = []             # per-round stump weights (alpha_m)
        self.G_M = []                # fitted weak learners, one per round
        self.M = None                # number of boosting rounds from fit()
        self.training_errors = []    # weighted training error per round
        self.prediction_errors = []  # not filled here; kept for callers
        self.alpha_type = alpha_type
        self.duplicate = duplicate

    def fit(self, X, y, M=40):
        """Fit M boosted stumps.

        X: array-like / DataFrame of independent variables.
        y: array-like / Series of {-1, +1} targets.
        M: number of boosting rounds (default 40).
        """
        # Reset state so fit() can be called more than once.
        self.alphas = []
        self.G_M = []
        self.training_errors = []
        self.M = M
        prob = 1  # chance of running the duplication step this round
        for m in range(M):
            if m == 0:
                # Round 0: uniform observation weights.
                w_i = np.ones(len(y)) / len(y)
            else:
                w_i = update_weights(w_i, alpha_m, y, y_pred)
                # Optionally duplicate the rows the previous stump got
                # wrong.  Restricted to m > 0 because y_pred does not
                # exist before the first stump is fitted (the original
                # code raised NameError here when duplicate=True).
                # Guard was `prob + 100 - prob > 0`, a tautology; the
                # intended check is `prob > 0`.
                if self.duplicate and prob > 0:
                    choice = random.choices(
                        [True, False], weights=[prob, 1 - prob], k=1)[0]
                    if choice:
                        mask = (y_pred != y)
                        # Next round's duplication probability is the
                        # current mismatch fraction.
                        prob = np.sum(mask) / len(X)
                        X = pd.concat([X, X[mask]], axis=0)
                        y = pd.concat([y, y[mask]], axis=0)
                        w_i = np.concatenate((w_i, w_i[mask]))
            # Weak learner: a single-split stump on one random feature.
            G_m = DecisionTreeClassifier(max_depth=1, max_features=1)
            G_m.fit(X, y, sample_weight=w_i)
            y_pred = G_m.predict(X)
            self.G_M.append(G_m)
            error_m = compute_error(y, y_pred, w_i)
            self.training_errors.append(error_m)
            alpha_m = compute_alpha(error_m, alpha_type=self.alpha_type)
            self.alphas.append(alpha_m)
        assert len(self.G_M) == len(self.alphas)

    def predict(self, X):
        """Return the sign of the alpha-weighted vote of all stumps."""
        weak_preds = pd.DataFrame(index=range(len(X)), columns=range(self.M))
        for m in range(self.M):
            weak_preds.iloc[:, m] = self.G_M[m].predict(X) * self.alphas[m]
        # Column-wise weighted sum across stumps, then take the sign.
        y_pred = (np.sign(weak_preds.T.sum())).astype(int)
        return y_pred
# AUXILIAR FUNCTIONS TO THE ADABOOST CLASS
def compute_error(y, y_pred, w_i):
    """Weighted misclassification rate of a weak learner.

    Returns the total weight on the mismatched examples divided by the
    total weight, so the result lies in [0, 1].
    """
    missed = np.not_equal(y, y_pred).astype(int)
    total_weight = sum(w_i)
    return sum(w_i * missed) / total_weight
def compute_alpha(error, alpha_type=0):
    """Weak-learner vote weight derived from its weighted error.

    alpha_type 0: classic AdaBoost weight 0.5*ln((1-err)/err); the
                  1e-10 epsilons avoid log(0) / division by zero at
                  the error extremes.
    alpha_type 1: experimental variant, directly proportional to the
                  error.

    Raises ValueError for any other alpha_type (the original silently
    returned None, which only surfaced later as a TypeError inside
    update_weights).
    """
    if alpha_type == 0:
        return 0.5 * np.log((1 - error + 1e-10) / (error + 1e-10))
    if alpha_type == 1:
        return error
    raise ValueError(f"unknown alpha_type: {alpha_type!r}")
def update_weights(w_i, alpha, y, y_pred):
    """Boost the weight of every misclassified example.

    Mismatched examples get their weight multiplied by exp(alpha);
    correctly classified ones are multiplied by exp(0) = 1, i.e. left
    unchanged.  No normalization is applied here.
    """
    missed = np.not_equal(y, y_pred).astype(int)
    scale = np.exp(alpha * missed)
    return w_i * scale