"""
This module declares the Bayesian classification tree models:
* PerpendicularClassificationTree
* HyperplaneClassificationTree
"""
from abc import ABC

import numpy as np
from sklearn.base import ClassifierMixin

from bayesian_decision_tree.base import BaseTree
from bayesian_decision_tree.base_hyperplane import BaseHyperplaneTree
from bayesian_decision_tree.base_perpendicular import BasePerpendicularTree
from bayesian_decision_tree.utils import multivariate_betaln


class BaseClassificationTree(BaseTree, ABC, ClassifierMixin):
    """
    Abstract base class of all Bayesian classification trees (perpendicular and hyperplane).
    Performs medium-level fitting and prediction tasks and outsources the low-level work
    to subclasses.
    """

    def __init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level=0):
        BaseTree.__init__(self, partition_prior, prior, delta, prune, child_type, False, split_precision, level)

    def predict_proba(self, X):
        """Predict class probabilities of the input samples X.

        Parameters
        ----------
        X : array-like, scipy.sparse.csc_matrix, scipy.sparse.csr_matrix or pandas.DataFrame, shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        p : array of shape = [n_samples, n_classes]
            The class probabilities of the input samples.
        """
        # input transformation and checks
        X, _ = self._normalize_data_and_feature_names(X)
        self._ensure_is_fitted(X)

        y_proba = [None] * X.shape[0]
        self._predict(X, np.arange(X.shape[0]), False, y_proba)
        return np.array(y_proba)

    def _check_target(self, y):
        if y.ndim != 1:
            raise ValueError('y should have 1 dimension but has {}'.format(y.ndim))

        n_classes = len(self.prior)
        # compare label sets explicitly; a direct `==` between arrays of
        # different lengths would fail to broadcast
        unique_classes = np.unique(y)
        if len(unique_classes) != n_classes or not np.all(unique_classes == np.arange(n_classes)):
            raise ValueError('Expected target values 0..{} but found {}..{}'.format(n_classes - 1, y.min(), y.max()))

    def _get_prior(self, n_data, n_dim):
        if self.prior is not None:
            return self.prior
        else:
            prior_pseudo_observation_count = max(1, n_data // 100)
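            # illustrative example: n_data=1000 and n_dim=3 classes give 10
            # pseudo-observations per class, i.e. a default prior of [10, 10, 10]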
            return prior_pseudo_observation_count * np.ones(n_dim)

    def _compute_log_p_data_no_split(self, y, prior):
        posterior = self._compute_posterior(y, prior)
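        # log prior probability of *not* splitting at this depth, plus the log
        # marginal likelihood of the Dirichlet-categorical model, which is the
        # ratio of multivariate beta functions B(posterior) / B(prior)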
        log_p_prior = np.log(1 - self.partition_prior ** (1 + self.level))
        log_p_data = multivariate_betaln(posterior) - multivariate_betaln(prior)

        return log_p_prior + log_p_data

    def _compute_log_p_data_split(self, y, prior, n_dim, split_indices):
        n_classes = len(prior)

        # cumulative class counts: for every candidate split, k1[i] holds the
        # number of class-i points left of the split and k2[i] the number right of it
        k1 = np.empty(n_classes, dtype=object)
        k2 = np.empty(n_classes, dtype=object)
        for i in range(n_classes):
            k1_and_total = (y == i).cumsum()
            total = k1_and_total[-1]
            k1[i] = k1_and_total[split_indices - 1]
            k2[i] = total - k1[i]

        n_splits = len(split_indices)
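        # the prior split probability partition_prior^(1+level) is divided
        # uniformly among all n_splits * n_dim candidate splits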
        log_p_prior = np.log(self.partition_prior ** (1 + self.level) / (n_splits * n_dim))

        betaln_prior = multivariate_betaln(prior)
        log_p_data1 = self._compute_log_p_data(k1, prior, betaln_prior)
        log_p_data2 = self._compute_log_p_data(k2, prior, betaln_prior)

        return log_p_prior + log_p_data1 + log_p_data2

    def _compute_posterior(self, y, prior, delta=1):
        if delta == 0:
            return prior

        # count the occurrences of each class in y and add them to the prior,
        # see https://en.wikipedia.org/wiki/Conjugate_prior#Discrete_distributions
        y_reshaped = np.broadcast_to(y, (len(prior), len(y)))
        classes = np.arange(len(prior)).reshape(-1, 1)
        k = np.sum(y_reshaped == classes, axis=1)
        posterior = prior + delta * k
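        # worked example (illustrative): prior=[1, 1, 1] and y=[0, 0, 1, 2, 2, 2]
        # give class counts k=[2, 1, 3], so with delta=1 the posterior is [3, 2, 4]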

        return posterior

    def _compute_posterior_mean(self):
        return self.posterior_ / np.sum(self.posterior_)

    def _compute_log_p_data(self, k, prior, betaln_prior):
        # see https://www.cs.ubc.ca/~murphyk/Teaching/CS340-Fall06/reading/bernoulli.pdf, equation (42),
        # which can be expressed as a fraction of beta functions
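        # explicitly: log p(D) = log B(prior + k) - log B(prior),
        # with B the multivariate beta function and k the per-class counts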
        return multivariate_betaln(prior + k) - betaln_prior

    def _predict_leaf(self):
        # predict the class with the highest posterior probability
        return np.argmax(self.posterior_)

    def _get_raw_leaf_data_internal(self):
        # prior and posterior raw data
        return np.array([self.prior, self.posterior_])


class PerpendicularClassificationTree(BasePerpendicularTree, BaseClassificationTree):
    """
    Bayesian binary or multiclass classification tree. Uses a Dirichlet prior
    (the multivariate generalization of the Beta prior to more than two classes).

    Parameters
    ----------
    partition_prior : float, must be > 0.0 and < 1.0, typical value: 0.9
        The prior probability of splitting a node's data into two children.

        Small values tend to reduce the tree depth, leading to less expressiveness
        but also to less overfitting.

        Large values tend to increase the tree depth and thus lead to the tree
        better fitting the data, which can lead to overfitting.

    prior : array_like, shape = [number of classes]
        The hyperparameters [alpha_0, alpha_1, ..., alpha_{N-1}] of the Dirichlet
        conjugate prior, see [1] and [2]. All alpha_i must be positive, where
        alpha_i represents the number of prior pseudo-observations of class i.

        Small values for alpha_i represent a weak prior which leads to the
        training data dominating the posterior. This can lead to overfitting.

        Large values for alpha_i represent a strong prior and thus put less weight
        on the data. This can be used for regularization.

    delta : float, default=0.0
        Determines the strengthening of the prior as the tree grows deeper,
        see [1]. Must be a value between 0.0 and 1.0.

    prune : boolean, default=False
        Prunes the tree after fitting if `True` by removing all splits that don't add information,
        i.e., where the predictions of both children are identical. It's usually sensible to set
        this to `True` in the classification case if you're only interested in class predictions
        (`predict(X)`), but it makes sense to set it to `False` if you're looking for class
        probabilities (`predict_proba(X)`). It can safely be set to `True` in the regression case
        because it will only merge children if their predictions are identical.

    split_precision : float, default=0.0
        The minimum distance between two contiguous points for a split to be considered between
        them. If the distance is below this threshold, the points are considered to overlap along
        this direction.

    level : DO NOT SET, ONLY USED BY SUBCLASSES

    See also
    --------
    demo_classification_perpendicular.py
    PerpendicularRegressionTree
    HyperplaneClassificationTree

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Dirichlet_distribution#Conjugate_to_categorical/multinomial

    .. [2] https://en.wikipedia.org/wiki/Conjugate_prior#Discrete_distributions

    Examples
    --------
    See `demo_classification_perpendicular.py`.
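
    A minimal sketch of the scikit-learn-style API (illustrative data and a
    two-class prior; `fit` is assumed to follow the usual `fit(X, y)` convention)::

        import numpy as np

        X = np.array([[0.0], [0.1], [0.9], [1.0]])
        y = np.array([0, 0, 1, 1])
        tree = PerpendicularClassificationTree(partition_prior=0.9, prior=np.array([1.0, 1.0]))
        tree.fit(X, y)
        print(tree.predict(X))        # class labels, e.g. [0 0 1 1]
        print(tree.predict_proba(X))  # shape (4, 2): per-class probabilities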
"""
def __init__(self, partition_prior=0.99, prior=None, delta=0, prune=False, split_precision=0.0, level=0):
child_type = PerpendicularClassificationTree
BasePerpendicularTree.__init__(self, partition_prior, prior, delta, prune, child_type, False, split_precision, level)
BaseClassificationTree.__init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level)


class HyperplaneClassificationTree(BaseHyperplaneTree, BaseClassificationTree):
    """
    Bayesian binary or multiclass classification tree using arbitrarily-oriented
    hyperplane splits. Uses a Dirichlet prior (the multivariate generalization
    of the Beta prior to more than two classes).

    Parameters
    ----------
    partition_prior : float, must be > 0.0 and < 1.0, typical value: 0.9
        The prior probability of splitting a node's data into two children.

        Small values tend to reduce the tree depth, leading to less expressiveness
        but also to less overfitting.

        Large values tend to increase the tree depth and thus lead to the tree
        better fitting the data, which can lead to overfitting.

    prior : array_like, shape = [number of classes]
        The hyperparameters [alpha_0, alpha_1, ..., alpha_{N-1}] of the Dirichlet
        conjugate prior, see [1] and [2]. All alpha_i must be positive, where
        alpha_i represents the number of prior pseudo-observations of class i.

        Small values for alpha_i represent a weak prior which leads to the
        training data dominating the posterior. This can lead to overfitting.

        Large values for alpha_i represent a strong prior and thus put less weight
        on the data. This can be used for regularization.

    delta : float, default=0.0
        Determines the strengthening of the prior as the tree grows deeper,
        see [1]. Must be a value between 0.0 and 1.0.

    prune : boolean, default=False
        Prunes the tree after fitting if `True` by removing all splits that don't add information,
        i.e., where the predictions of both children are identical. It's usually sensible to set
        this to `True` in the classification case if you're only interested in class predictions
        (`predict(X)`), but it makes sense to set it to `False` if you're looking for class
        probabilities (`predict_proba(X)`). It can safely be set to `True` in the regression case
        because it will only merge children if their predictions are identical.

    optimizer : object
        A global optimization algorithm object that searches for the optimal
        hyperplane orientation. The available options are (in the order in which
        you should try them):

        - ScipyOptimizer: A wrapper around scipy global optimizers. See usages for examples.
        - SimulatedAnnealingOptimizer: Experimental, but works well with n_scan=20, n_keep=10, spread_factor=0.95
        - RandomHyperplaneOptimizer: Experimental, mediocre performance
        - RandomTwoPointOptimizer: Experimental, mediocre performance
        - GradientDescentOptimizer: Experimental, mediocre performance

    split_precision : float, default=0.0
        The minimum distance between two contiguous points for a split to be considered between
        them. If the distance is below this threshold, the points are considered to overlap along
        this direction.

    level : DO NOT SET, ONLY USED BY SUBCLASSES

    See also
    --------
    demo_classification_hyperplane.py
    HyperplaneRegressionTree
    PerpendicularClassificationTree

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Dirichlet_distribution#Conjugate_to_categorical/multinomial

    .. [2] https://en.wikipedia.org/wiki/Conjugate_prior#Discrete_distributions

    Examples
    --------
    See `demo_classification_hyperplane.py`.
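
    A minimal sketch of the scikit-learn-style API (illustrative data and prior;
    the default optimizer is used, and `fit` is assumed to follow the usual
    `fit(X, y)` convention)::

        import numpy as np

        X = np.array([[0.0, 0.1], [0.1, 0.0], [0.9, 1.0], [1.0, 0.9]])
        y = np.array([0, 0, 1, 1])
        tree = HyperplaneClassificationTree(partition_prior=0.9, prior=np.array([1.0, 1.0]))
        tree.fit(X, y)
        print(tree.predict_proba(X))  # shape (4, 2): per-class probabilities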
"""
def __init__(self, partition_prior=0.99, prior=None, delta=None, prune=False, optimizer=None, split_precision=0.0, level=0):
child_type = HyperplaneClassificationTree
BaseHyperplaneTree.__init__(self, partition_prior, prior, delta, prune, child_type, False, optimizer, split_precision, level)
BaseClassificationTree.__init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level)