Add multi-output support to honest trees (#86)
* ENH multi-output support for honest trees/forests

---------

Signed-off-by: Haoyin Xu <haoyinxu@gmail.com>
Co-authored-by: Ronan Perry <13107341+rflperry@users.noreply.github.com>
Co-authored-by: Adam Li <adam2392@gmail.com>
3 people committed Jun 22, 2023
1 parent 7eb96e6 commit d506519
Showing 6 changed files with 114 additions and 78 deletions.
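For orientation, here is a minimal usage sketch of what this commit enables, adapted from the tests added below. The second target column is purely illustrative, and the top-level import path is an assumption:

```python
import numpy as np
from sklearn import datasets
from sktree import HonestForestClassifier  # assumed import path

iris = datasets.load_iris()
X = iris.data

# Build a two-output target by stacking the iris labels with a second,
# purely illustrative label vector (as in test_iris_multi below).
second_y = np.concatenate([np.ones(50) * 3, np.ones(50) * 4, np.ones(50) * 5])
y = np.stack((iris.target, second_y)).T  # shape (150, 2)

clf = HonestForestClassifier(n_estimators=10, random_state=0)
clf.fit(X, y)

preds = clf.predict(X)          # shape (n_samples, n_outputs)
probas = clf.predict_proba(X)   # list of per-output (n_samples, n_classes_k) arrays
```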
1 change: 1 addition & 0 deletions docs/whats_new/v0.1.rst
@@ -37,6 +37,7 @@ Changelog
- |Feature| Implementation of ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeRegressor, ObliqueRandomForestRegressor, PatchObliqueRandomForestRegressor, by `SUKI-O`_ (:pr:`72`)
- |Feature| Implementation of HonestTreeClassifier, HonestForestClassifier, by `Sambit Panda`_, `Adam Li`_, `Ronan Perry`_ and `Haoyin Xu`_ (:pr:`57`)
- |Feature| Implementation of (conditional) mutual information estimation via unsupervised tree models and added NearestNeighborsMetaEstimator by `Adam Li`_ (:pr:`83`)
- |Feature| Add multi-output support to HonestTreeClassifier, HonestForestClassifier, by `Ronan Perry`_, `Haoyin Xu`_ and `Adam Li`_ (:pr:`86`)


Code and Documentation Contributors
Binary file added examples/overlapping_gaussians.png
39 changes: 22 additions & 17 deletions sktree/ensemble/_honest_forest.py
@@ -400,9 +400,7 @@ def fit(self, X, y, sample_weight=None):
Fitted tree estimator.
"""
super().fit(X, y, sample_weight)
X, y = check_X_y(X, y, multi_output=False)
classes_k, y_encoded = np.unique(y, return_inverse=True)
self.empirical_prior_ = np.bincount(y_encoded, minlength=classes_k.shape[0]) / len(y)
X, y = check_X_y(X, y, multi_output=True)

# Compute honest decision function
self.honest_decision_function_ = self._predict_proba(
@@ -440,7 +438,9 @@ def _predict_proba(self, X, indices=None, impute_missing=None):
n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

# avoid storing the output of every tree estimator by summing them here
posteriors = np.zeros((X.shape[0], self.n_classes_), dtype=np.float64)
posteriors = [
np.zeros((X.shape[0], j), dtype=np.float64) for j in np.atleast_1d(self.n_classes_)
]
lock = threading.Lock()

if indices is None:
@@ -451,14 +451,23 @@
)

# Normalize to unit length, due to prior weighting
zero_mask = posteriors.sum(1) == 0
posteriors = np.array(posteriors)
zero_mask = posteriors.sum(2) == 0
posteriors[~zero_mask] /= posteriors[~zero_mask].sum(1, keepdims=True)

if impute_missing is None:
posteriors[zero_mask] = self.empirical_prior_
pass
else:
posteriors[zero_mask] = impute_missing

return posteriors
# preserve shape of multi-outputs
if self.n_outputs_ > 1:
posteriors = [post for post in posteriors]

if len(posteriors) == 1:
return posteriors[0]
else:
return posteriors

@property
def structure_indices_(self):
@@ -485,15 +494,11 @@ def _accumulate_prediction(tree, X, out, lock, indices=None):

if indices is None:
indices = np.arange(X.shape[0])
proba = tree.tree_.predict(X[indices])
proba = proba[:, : tree._tree_n_classes_]
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba /= normalizer

if tree._tree_n_classes_ != tree.n_classes_:
proba = tree._impute_missing_classes(proba)
proba = tree._empty_leaf_correction(proba, normalizer)
proba = tree.predict_proba(X[indices], check_input=False)

with lock:
out[indices] += proba
if len(out) == 1:
out[0][indices] += proba
else:
for i in range(len(out)):
out[i][indices] += proba[i]
75 changes: 51 additions & 24 deletions sktree/tests/test_honest_forest.py
@@ -1,7 +1,7 @@
import numpy as np
import pytest
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.estimator_checks import parametrize_with_checks

@@ -30,6 +30,7 @@ def test_toy_accuracy():

@pytest.mark.parametrize("criterion", ["gini", "entropy"])
@pytest.mark.parametrize("max_features", [None, 2])
@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore", "error"])
@pytest.mark.parametrize(
"estimator",
[
@@ -38,17 +39,21 @@ def test_toy_accuracy():
DecisionTreeClassifier(),
ObliqueDecisionTreeClassifier(),
PatchObliqueDecisionTreeClassifier(),
],
)
def test_iris(criterion, max_features, estimator):
def test_iris(criterion, max_features, honest_prior, estimator):
# Check consistency on dataset iris.
clf = HonestForestClassifier(
criterion=criterion,
random_state=0,
max_features=max_features,
n_estimators=10,
honest_prior=honest_prior,
tree_estimator=estimator,
)
clf.fit(iris.data, iris.target)
score = accuracy_score(clf.predict(iris.data), iris.target)
try:
clf.fit(iris.data, iris.target)
score = accuracy_score(clf.predict(iris.data), iris.target)
except ValueError:
return
assert score > 0.5 and score < 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)
@@ -59,26 +64,55 @@ def test_iris(criterion, max_features, estimator):
)


def test_impute_classes():
np.random.seed(0)
X = np.random.normal(0, 1, (101, 2))
y = [0] * 50 + [1] * 50 + [2]
clf = HonestForestClassifier(honest_fraction=0.02, random_state=0)
clf = clf.fit(X, y)
@pytest.mark.parametrize("criterion", ["gini", "entropy"])
@pytest.mark.parametrize("max_features", [None, 2])
@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore", "error"])
@pytest.mark.parametrize(
"estimator",
[
DecisionTreeClassifier(),
ObliqueDecisionTreeClassifier(),
PatchObliqueDecisionTreeClassifier(),
],
)
def test_iris_multi(criterion, max_features, honest_prior, estimator):
# Check consistency on dataset iris.
clf = HonestForestClassifier(
criterion=criterion,
random_state=0,
max_features=max_features,
n_estimators=10,
honest_prior=honest_prior,
tree_estimator=estimator,
)

y_proba = clf.predict_proba(X)
second_y = np.concatenate([(np.ones(50) * 3), (np.ones(50) * 4), (np.ones(50) * 5)])

assert y_proba.shape[1] == 3
X = iris.data
y = np.stack((iris.target, second_y[perm])).T
try:
clf.fit(X, y)
score = r2_score(clf.predict(X), y)
except ValueError:
return
if honest_prior == "ignore":
assert (
score > 0.6 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
else:
assert (
score > 0.9 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)


def test_max_samples():
max_samples_list = [8, 0.5, None]
depths = []
X = np.random.normal(0, 1, (100, 2))
X = rng.normal(0, 1, (100, 2))
X[:50] *= -1
y = [0, 1] * 50
for ms in max_samples_list:
uf = HonestForestClassifier(n_estimators=2, max_samples=ms, bootstrap=True)
uf = HonestForestClassifier(n_estimators=2, random_state=0, max_samples=ms, bootstrap=True)
uf = uf.fit(X, y)
depths.append(uf.estimators_[0].get_depth())

@@ -94,8 +128,7 @@ def test_max_samples():
],
)
def test_impute_posteriors(honest_prior, val):
np.random.seed(0)
X = np.random.normal(0, 1, (100, 2))
X = rng.normal(0, 1, (100, 2))
y = [0] * 75 + [1] * 25
clf = HonestForestClassifier(
honest_fraction=0.02, random_state=0, honest_prior=honest_prior, n_estimators=2
@@ -121,8 +154,7 @@ def test_impute_posteriors(honest_prior, val):
],
)
def test_honest_decision_function(honest_fraction, val):
np.random.seed(0)
X = np.random.normal(0, 1, (100, 2))
X = rng.normal(0, 1, (100, 2))
y = [0] * 75 + [1] * 25
clf = HonestForestClassifier(honest_fraction=honest_fraction, random_state=0, n_estimators=2)
clf = clf.fit(X, y)
@@ -138,17 +170,12 @@ def test_honest_decision_function(honest_fraction, val):
[HonestForestClassifier(n_estimators=10, honest_fraction=0.5, random_state=0)]
)
def test_sklearn_compatible_estimator(estimator, check):
# 1. multi-output is not supported
# 2. check_class_weight_classifiers is not supported since it requires sample weight
# 1. check_class_weight_classifiers is not supported since it requires sample weight
# XXX: can include this "generalization" in the future if it's useful
# zero sample weight is not "really supported" in honest subsample trees since sample weight
# for fitting the tree's splits
if check.func.__name__ in [
"check_class_weight_classifiers",
"check_classifiers_multilabel_output_format_decision_function",
"check_classifiers_multilabel_output_format_predict_proba",
"check_classifiers_multilabel_output_format_predict",
"check_classifiers_multilabel_representation_invariance",
]:
pytest.skip()
check(estimator)
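As a standalone sketch, the same compatibility suite can be driven directly through scikit-learn's `check_estimator`; note that `check_class_weight_classifiers`, skipped in the test above, would still fail here since it is not skippable this way:

```python
from sklearn.utils.estimator_checks import check_estimator
from sktree import HonestForestClassifier  # assumed import path

# Runs the sklearn estimator check suite; with multi-output support, the
# multilabel output-format checks no longer need to be skipped.
check_estimator(HonestForestClassifier(n_estimators=10, honest_fraction=0.5, random_state=0))
```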
74 changes: 40 additions & 34 deletions sktree/tree/_honest_tree.py
@@ -360,12 +360,6 @@ def fit(self, X, y, sample_weight=None, check_input=True):
)
self._inherit_estimator_attributes()

if self.n_outputs_ > 1:
raise NotImplementedError(
"Multi-target honest trees not yet \
implemented"
)

# update the number of classes, unsplit
if y.ndim == 1:
# reshape is necessary to preserve the data contiguity against vs
@@ -419,8 +413,8 @@ def _set_leaf_nodes(self, leaf_ids, y):
classes are ordered by their index in the tree_.value array.
"""
self.tree_.value[:, :, :] = 0
for leaf_id, yval in zip(leaf_ids, y[self.honest_indices_, 0]):
self.tree_.value[leaf_id][0, yval] += 1
for leaf_id, yval in zip(leaf_ids, y[self.honest_indices_, :]):
self.tree_.value[leaf_id][:, yval] += 1

def _inherit_estimator_attributes(self):
"""Initialize necessary attributes from the provided tree estimator"""
@@ -431,29 +425,36 @@ def _inherit_estimator_attributes(self):
self.n_outputs_ = self.estimator_.n_outputs_
self.tree_ = self.estimator_.tree_

def _empty_leaf_correction(self, proba, normalizer):
"""Leaves with empty posteriors are assigned values"""
def _empty_leaf_correction(self, proba, pos=0):
"""Leaves with empty posteriors are assigned values.
The posteriors are corrected according to the honest prior.
In multi-output cases, the posterior corrections only correspond
to the respective y dimension, indicated by the position param pos.
"""
zero_mask = proba.sum(axis=1) == 0.0
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")

# For multi-output cases
if self.n_outputs_ > 1:
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_[pos]
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_[pos]
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
else:
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
return proba

def _impute_missing_classes(self, proba):
"""Due to splitting, provide proba outputs for some classes"""
new_proba = np.zeros((proba.shape[0], self.n_classes_))
for i, old_class in enumerate(self._tree_classes_):
j = np.where(self.classes_ == old_class)[0][0]
new_proba[:, j] = proba[:, i]

return new_proba

def predict_proba(self, X, check_input=True):
"""Predict class probabilities of the input samples X.
@@ -487,17 +488,22 @@ class in a leaf.
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba /= normalizer
if self._tree_n_classes_ != self.n_classes_:
proba = self._impute_missing_classes(proba)
proba = self._empty_leaf_correction(proba, normalizer)
proba = self._empty_leaf_correction(proba)

return proba

else:
raise NotImplementedError(
"Multi-target honest trees not yet \
implemented"
)
all_proba = []

for k in range(self.n_outputs_):
proba_k = proba[:, k, : self._tree_n_classes_[k]]
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba_k /= normalizer
proba_k = self._empty_leaf_correction(proba_k, k)
all_proba.append(proba_k)

return all_proba

def predict(self, X, check_input=True):
"""Predict class for X.
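The per-output empty-leaf correction in `_empty_leaf_correction` above reduces to the following toy numpy sketch (standalone, with made-up numbers): any posterior row that sums to zero, i.e. a leaf that received no honest samples, is filled from the configured prior.

```python
import numpy as np

proba = np.array([[0.7, 0.3],
                  [0.0, 0.0]])            # second row: leaf saw no honest samples
empirical_prior = np.array([0.75, 0.25])  # class frequencies from the honest set
n_classes = 2
honest_prior = "empirical"                # or "uniform" / "ignore"

zero_mask = proba.sum(axis=1) == 0.0
if honest_prior == "empirical":
    proba[zero_mask] = empirical_prior
elif honest_prior == "uniform":
    proba[zero_mask] = 1 / n_classes
elif honest_prior == "ignore":
    proba[zero_mask] = np.nan
else:
    raise ValueError(f"honest_prior {honest_prior} not a valid input.")
```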
3 changes: 0 additions & 3 deletions sktree/tree/tests/test_honest_tree.py
@@ -106,7 +106,4 @@ def test_impute_classes():

@parametrize_with_checks([HonestTreeClassifier(random_state=0)])
def test_sklearn_compatible_estimator(estimator, check):
# TODO: remove when we implement Regressor classes
# if TREE_ESTIMATORS[estimator].__name__ in TREE_CLASSIFIERS:
# pytest.skip()
check(estimator)
