-
Notifications
You must be signed in to change notification settings - Fork 1
/
tree.py
204 lines (155 loc) · 6.85 KB
/
tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import pandas as pd
import numpy as np
from node import *
from UNode import *
from PNode import *
from FNode import *
class Tree:
    """Decision-tree wrapper that delegates to a criterion-specific root node.

    Depending on ``criterium`` (``'gain'`` or ``'uncertainty'``) and the
    ``parallel`` strategy (``None``, ``'splits'`` or ``'features'``), the
    root becomes a ``Node``, ``UNode``, ``PNode`` or ``FNode`` instance.
    The class also provides evaluation helpers: prediction tables,
    (probabilistic and hard) confusion matrices, precision, recall and
    f-score.
    """

    def __init__(self, criterium, max_depth=8, min_samples_split=10,
                 most_mass_threshold=0.9, min_mass_threshold=0.0127,
                 min_weight_threshold=0.0, parallel=None, n_jobs=1):
        # ``root`` is replaced by the fitted root node in fit().
        self.root = []
        self.criterium = criterium
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.most_mass_threshold = most_mass_threshold
        self.min_mass_threshold = min_mass_threshold
        self.min_weight_threshold = min_weight_threshold
        self.parallel = parallel
        self.n_jobs = n_jobs

    def fit(self, data, y):
        """Fit the tree on a training set.

        Parameters
        ----------
        data: DataFrame of training objects (rows) by features (columns).
        y: class labels of the training objects.

        Notes
        -----
        For the 'uncertainty' criterion the node implementations expect the
        class as a column of the frame; a copy is taken so the caller's
        DataFrame is not mutated (the previous version added the column in
        place).  An unrecognized criterium/parallel combination is a no-op,
        matching the original behavior.
        """
        if self.criterium == 'gain':
            self.root = Node(level=1, max_depth=self.max_depth,
                             min_samples_split=self.min_samples_split)
            self.root.fit(data, y)
            return
        if self.criterium != 'uncertainty':
            return  # unknown criterion: preserve original silent no-op
        data = data.copy()
        data['class'] = y
        common = dict(level=1, max_depth=self.max_depth,
                      min_samples_split=self.min_samples_split,
                      most_mass_threshold=self.most_mass_threshold,
                      min_mass_threshold=self.min_mass_threshold)
        if self.parallel is None:
            self.root = UNode(**common)
        elif self.parallel == 'splits':
            self.root = PNode(n_jobs=self.n_jobs, **common)
        elif self.parallel == 'features':
            self.root = FNode(n_jobs=self.n_jobs, **common)
        else:
            return  # unknown parallel strategy: preserve original no-op
        self.root.fit(data)

    def predict(self, tupla):
        """Return the predicted class of a single tuple (row).

        For the 'gain' criterion this is whatever the root node returns.
        For 'uncertainty' the root returns a dict mapping class -> probability
        and this method returns ``(class, probability)`` of the most probable
        class (on ties, the last key reaching the maximum wins, as in the
        original implementation).
        """
        if self.criterium != 'uncertainty':
            return self.root.predict(tupla)
        prediction = self.root.predict(tupla)
        maximo = max(prediction.values())
        clase = None
        for key, prob in prediction.items():
            if prob == maximo:
                clase = key
        return clase, maximo

    def show(self):
        """Prints the tree structure"""
        self.root.show()

    def predict_table(self, frame, y):
        """Return the original class, the prediction and its probability.

        It serves as a testing mechanism of the performance of a classifier.

        Parameters
        ----------
        frame: DataFrame of the data that must be classified. Each row is an
            object and each column is a feature.
        y: real classes of the data that must be classified (indexable by
            ``frame``'s index labels).
        """
        tabla = []
        for index, row in frame.iterrows():
            clase = y[index]
            predicted, confianza = self.predict(row)
            tabla.append([clase, predicted, confianza])
        return pd.DataFrame(tabla, index=frame.index,
                            columns=['original', 'predicted', 'trust'])

    def confusion_matrix(self, table):
        """Generate a (probabilistic) confusion matrix from a prediction table.

        Each prediction contributes its trust (probability) to the cell at
        column = original class, row = predicted class.

        Bug fix: the previous ``np.concatenate(..., axis=1)`` raised an
        AxisError on the 1-D label arrays; labels are now concatenated along
        the only axis, as ``hard_matrix`` already did.
        """
        labels = np.unique(np.concatenate((table['original'].values,
                                           table['predicted'].values)))
        matrix = pd.DataFrame(np.zeros((len(labels), len(labels))),
                              index=labels, columns=labels)
        for _, row in table.iterrows():
            matrix.at[row['predicted'], row['original']] += row['trust']
        return matrix

    def hard_matrix(self, table):
        """Generate a hard confusion matrix for probabilistic classifiers.

        Like ``confusion_matrix`` but each prediction counts as 1 regardless
        of its trust.
        """
        labels = np.unique(np.concatenate((table['original'].values,
                                           table['predicted'].values)))
        matrix = pd.DataFrame(np.zeros((len(labels), len(labels))),
                              index=labels, columns=labels)
        for _, row in table.iterrows():
            matrix.at[row['predicted'], row['original']] += 1
        return matrix

    def precision(self, matrix, clase):
        """Return the precision of a given class, based on a confusion matrix.

        Returns -1 when the class does not appear in the matrix, to signal
        that there are no predictions of that class.
        """
        if clase not in matrix.columns:
            return -1
        correctos = matrix[clase].loc[clase]
        total = matrix[clase].sum()
        return correctos / total

    def recall(self, matrix, clase):
        """Return the recall of a given class, based on a confusion matrix.

        Returns -1 when the class does not appear in the matrix, to signal
        that there are no predictions of that class.
        """
        if clase not in matrix.columns:
            return -1
        reconocidos = matrix[clase].loc[clase]
        total = matrix.loc[clase].sum()
        return reconocidos / total

    def f_score(self, matrix, clase):
        """Return the f-score of a given class, based on a confusion matrix.

        Returns 0 when either precision or recall is 0 (avoids division by
        zero) and -1 when either is invalid (class absent from the matrix).
        """
        acc = self.precision(matrix, clase)
        rec = self.recall(matrix, clase)
        if acc == 0 or rec == 0:
            return 0
        if acc == -1 or rec == -1:
            return -1
        return 2 * acc * rec / (acc + rec)

    def weighted_f_score(self, matrix):
        """Return the f-score averaged over classes, weighted by row totals."""
        clases = matrix.columns.tolist()
        counts = {c: matrix.loc[c].sum() for c in clases}
        f_scores = {c: self.f_score(matrix, c) for c in clases}
        total = sum(counts.values())
        ret = 0
        for c in clases:
            ret += counts[c] / total * f_scores[c]
        return ret

    def get_splits(self):
        """Return a dict mapping feature name -> list of split values used
        by the internal (non-leaf) nodes, gathered breadth-first."""
        splits = {}
        node_list = [self.root]
        while node_list:
            node = node_list.pop(0)
            if not node.is_leaf:
                splits.setdefault(node.feat_name, []).append(node.feat_value)
                node_list.append(node.right)
                node_list.append(node.left)
        return splits