Add multi-output support to honest trees #86

Merged (20 commits, Jun 22, 2023)
1 change: 1 addition & 0 deletions docs/whats_new/v0.1.rst
@@ -37,6 +37,7 @@ Changelog
- |Feature| Implementation of ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeRegressor, ObliqueRandomForestRegressor, PatchObliqueRandomForestRegressor, by `SUKI-O`_ (:pr:`72`)
- |Feature| Implementation of HonestTreeClassifier, HonestForestClassifier, by `Sambit Panda`_, `Adam Li`_, `Ronan Perry`_ and `Haoyin Xu`_ (:pr:`57`)
- |Feature| Implementation of (conditional) mutual information estimation via unsupervised tree models and added NearestNeighborsMetaEstimator by `Adam Li`_ (:pr:`83`)
- |Feature| Add multi-output support to HonestTreeClassifier, HonestForestClassifier, by `Ronan Perry`_, `Haoyin Xu`_ and `Adam Li`_ (:pr:`86`)


Code and Documentation Contributors
Binary file added examples/overlapping_gaussians.png
39 changes: 22 additions & 17 deletions sktree/ensemble/_honest_forest.py
@@ -400,9 +400,7 @@ def fit(self, X, y, sample_weight=None):
Fitted tree estimator.
"""
super().fit(X, y, sample_weight)
X, y = check_X_y(X, y, multi_output=False)
classes_k, y_encoded = np.unique(y, return_inverse=True)
self.empirical_prior_ = np.bincount(y_encoded, minlength=classes_k.shape[0]) / len(y)
X, y = check_X_y(X, y, multi_output=True)

# Compute honest decision function
self.honest_decision_function_ = self._predict_proba(
@@ -440,7 +438,9 @@ def _predict_proba(self, X, indices=None, impute_missing=None):
n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

# avoid storing the output of every tree estimator by summing them here
posteriors = np.zeros((X.shape[0], self.n_classes_), dtype=np.float64)
posteriors = [
np.zeros((X.shape[0], j), dtype=np.float64) for j in np.atleast_1d(self.n_classes_)
]
lock = threading.Lock()

if indices is None:
@@ -451,14 +451,23 @@
)

# Normalize to unit length, due to prior weighting
zero_mask = posteriors.sum(1) == 0
posteriors = np.array(posteriors)
zero_mask = posteriors.sum(2) == 0
posteriors[~zero_mask] /= posteriors[~zero_mask].sum(1, keepdims=True)

if impute_missing is None:
posteriors[zero_mask] = self.empirical_prior_
pass
else:
posteriors[zero_mask] = impute_missing

return posteriors
# preserve shape of multi-outputs
if self.n_outputs_ > 1:
posteriors = [post for post in posteriors]

if len(posteriors) == 1:
return posteriors[0]
else:
return posteriors

@property
def structure_indices_(self):
@@ -485,15 +494,11 @@ def _accumulate_prediction(tree, X, out, lock, indices=None):

if indices is None:
indices = np.arange(X.shape[0])
proba = tree.tree_.predict(X[indices])
proba = proba[:, : tree._tree_n_classes_]
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba /= normalizer

if tree._tree_n_classes_ != tree.n_classes_:
proba = tree._impute_missing_classes(proba)
proba = tree._empty_leaf_correction(proba, normalizer)
proba = tree.predict_proba(X[indices], check_input=False)

with lock:
out[indices] += proba
if len(out) == 1:
out[0][indices] += proba
else:
for i in range(len(out)):
out[i][indices] += proba[i]
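
Taken together, the `_predict_proba` and `_accumulate_prediction` changes above implement one running sum per output. A condensed, sequential sketch of the idea (the real code dispatches trees through joblib workers and applies the honest-prior handling shown above; the names here are illustrative):

```python
import threading
import numpy as np

def forest_posteriors(trees, X, n_classes_per_output):
    # one running-sum array per output, so per-tree posteriors are
    # never all held in memory at once
    posteriors = [np.zeros((X.shape[0], k)) for k in n_classes_per_output]
    lock = threading.Lock()
    for tree in trees:  # parallelized via joblib in the real code
        proba = tree.predict_proba(X)
        proba = proba if isinstance(proba, list) else [proba]
        with lock:
            for out, p in zip(posteriors, proba):
                out += p
    # normalize each output's summed posteriors to unit row sums
    for out in posteriors:
        norm = out.sum(axis=1, keepdims=True)
        norm[norm == 0.0] = 1.0
        out /= norm
    return posteriors[0] if len(posteriors) == 1 else posteriors
```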
68 changes: 49 additions & 19 deletions sktree/tests/test_honest_forest.py
@@ -1,7 +1,7 @@
import numpy as np
import pytest
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.estimator_checks import parametrize_with_checks

@@ -30,6 +30,7 @@ def test_toy_accuracy():

@pytest.mark.parametrize("criterion", ["gini", "entropy"])
@pytest.mark.parametrize("max_features", [None, 2])
@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore", "error"])
@pytest.mark.parametrize(
"estimator",
[
@@ -38,17 +39,21 @@ def test_toy_accuracy():
PatchObliqueDecisionTreeClassifier(),
],
)
def test_iris(criterion, max_features, estimator):
def test_iris(criterion, max_features, honest_prior, estimator):
# Check consistency on dataset iris.
clf = HonestForestClassifier(
criterion=criterion,
random_state=0,
max_features=max_features,
n_estimators=10,
honest_prior=honest_prior,
tree_estimator=estimator,
)
clf.fit(iris.data, iris.target)
score = accuracy_score(clf.predict(iris.data), iris.target)
try:
clf.fit(iris.data, iris.target)
score = accuracy_score(clf.predict(iris.data), iris.target)
except ValueError:
return
assert score > 0.5 and score < 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)
@@ -59,26 +64,56 @@ def test_iris(criterion, max_features, honest_prior, estimator):
)


def test_impute_classes():
np.random.seed(0)
X = np.random.normal(0, 1, (101, 2))
y = [0] * 50 + [1] * 50 + [2]
clf = HonestForestClassifier(honest_fraction=0.02, random_state=0)
clf = clf.fit(X, y)
@pytest.mark.parametrize("criterion", ["gini", "entropy"])
@pytest.mark.parametrize("max_features", [None, 2])
@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore", "error"])
@pytest.mark.parametrize(
"estimator",
[
DecisionTreeClassifier(),
ObliqueDecisionTreeClassifier(),
PatchObliqueDecisionTreeClassifier(),
],
)
def test_iris_multi(criterion, max_features, honest_prior, estimator):
# Check consistency on dataset iris.
clf = HonestForestClassifier(
criterion=criterion,
random_state=0,
max_features=max_features,
n_estimators=10,
honest_prior=honest_prior,
tree_estimator=estimator,
)

y_proba = clf.predict_proba(X)
second_y = np.concatenate([(np.ones(50) * 3), (np.ones(50) * 4), (np.ones(50) * 5)])

assert y_proba.shape[1] == 3
X = iris.data
y = np.stack((iris.target, second_y[perm])).T
try:
clf.fit(X, y)
score = r2_score(clf.predict(X), y)
except ValueError:
return
if honest_prior == "ignore":
assert (
score > 0.6 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
else:
assert (
score > 0.9 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)


def test_max_samples():
max_samples_list = [8, 0.5, None]
depths = []
np.random.seed(0)
adam2392 (Collaborator) commented on Jun 22, 2023:

Don't set a global random seed. I just realized a lot of tests are doing this. It isn't thread safe, which is an issue for Cythonized code.

Instead, you can set a module-level seed = 12345 and then, in each place that uses np.random, run rng = np.random.default_rng(seed) and use rng in place of np.random.

Can you do this everywhere in the file (I think just 8 places use np.random.seed)?

Here's a reference on it: https://albertcthomas.github.io/good-practices-random-number-generators/

Member Author replied: will do.
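
For reference, the pattern the reviewer suggests looks roughly like this (a sketch of the suggestion, not code from this PR):

```python
import numpy as np

seed = 12345  # single module-level seed shared across tests

def make_data():
    # a local Generator keeps draws reproducible without touching
    # NumPy's global (thread-unsafe) random state
    rng = np.random.default_rng(seed)
    X = rng.normal(0, 1, (100, 2))
    y = rng.integers(0, 2, size=100)
    return X, y
```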

X = np.random.normal(0, 1, (100, 2))
X[:50] *= -1
y = [0, 1] * 50
for ms in max_samples_list:
uf = HonestForestClassifier(n_estimators=2, max_samples=ms, bootstrap=True)
uf = HonestForestClassifier(n_estimators=2, random_state=0, max_samples=ms, bootstrap=True)
uf = uf.fit(X, y)
depths.append(uf.estimators_[0].get_depth())

@@ -138,17 +173,12 @@ def test_honest_decision_function(honest_fraction, val):
[HonestForestClassifier(n_estimators=10, honest_fraction=0.5, random_state=0)]
)
def test_sklearn_compatible_estimator(estimator, check):
# 1. multi-output is not supported
# 2. check_class_weight_classifiers is not supported since it requires sample weight
# 1. check_class_weight_classifiers is not supported since it requires sample weight
# XXX: can include this "generalization" in the future if it's useful
# zero sample weight is not "really supported" in honest subsample trees, since sample
# weight is used for fitting the tree's splits
if check.func.__name__ in [
"check_class_weight_classifiers",
"check_classifiers_multilabel_output_format_decision_function",
"check_classifiers_multilabel_output_format_predict_proba",
"check_classifiers_multilabel_output_format_predict",
"check_classifiers_multilabel_representation_invariance",
]:
pytest.skip()
check(estimator)
74 changes: 40 additions & 34 deletions sktree/tree/_honest_tree.py
@@ -360,12 +360,6 @@ def fit(self, X, y, sample_weight=None, check_input=True):
)
self._inherit_estimator_attributes()

if self.n_outputs_ > 1:
raise NotImplementedError(
"Multi-target honest trees not yet \
implemented"
)

# update the number of classes, unsplit
if y.ndim == 1:
# reshape is necessary to preserve the data contiguity against vs
@@ -419,8 +413,8 @@ def _set_leaf_nodes(self, leaf_ids, y):
classes are ordered by their index in the tree_.value array.
"""
self.tree_.value[:, :, :] = 0
for leaf_id, yval in zip(leaf_ids, y[self.honest_indices_, 0]):
self.tree_.value[leaf_id][0, yval] += 1
for leaf_id, yval in zip(leaf_ids, y[self.honest_indices_, :]):
self.tree_.value[leaf_id][:, yval] += 1

def _inherit_estimator_attributes(self):
"""Initialize necessary attributes from the provided tree estimator"""
@@ -431,29 +425,36 @@ def _inherit_estimator_attributes(self):
self.n_outputs_ = self.estimator_.n_outputs_
self.tree_ = self.estimator_.tree_

def _empty_leaf_correction(self, proba, normalizer):
"""Leaves with empty posteriors are assigned values"""
def _empty_leaf_correction(self, proba, pos=0):
adam2392 (Collaborator) commented:

Can you just add a short docstring to describe what's going on? I'm reading these lines and having trouble figuring out what pos actually does :/

Member Author replied: pos indicates the class dimension of y, so that posterior corrections only apply to that dimension in multi-output cases.

"""Leaves with empty posteriors are assigned values.

The posteriors are corrected according to the honest prior.
In multi-output cases, the posterior corrections only correspond
to the respective y dimension, indicated by the position param pos.
"""
zero_mask = proba.sum(axis=1) == 0.0
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")

# For multi-output cases
if self.n_outputs_ > 1:
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_[pos]
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_[pos]
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
else:
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
return proba
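
As a toy illustration of the correction above (hypothetical numbers; single-output case with honest_prior="uniform" and three classes):

```python
import numpy as np

proba = np.array([[0.8, 0.2, 0.0],
                  [0.0, 0.0, 0.0],   # empty leaf: no honest samples fell here
                  [0.1, 0.6, 0.3]])
zero_mask = proba.sum(axis=1) == 0.0  # flags the all-zero row
proba[zero_mask] = 1 / 3              # replaced by the uniform prior
```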

def _impute_missing_classes(self, proba):
"""Due to splitting, provide proba outputs for some classes"""
new_proba = np.zeros((proba.shape[0], self.n_classes_))
for i, old_class in enumerate(self._tree_classes_):
j = np.where(self.classes_ == old_class)[0][0]
new_proba[:, j] = proba[:, i]

return new_proba

def predict_proba(self, X, check_input=True):
"""Predict class probabilities of the input samples X.

@@ -487,17 +488,22 @@ class in a leaf.
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba /= normalizer
if self._tree_n_classes_ != self.n_classes_:
proba = self._impute_missing_classes(proba)
proba = self._empty_leaf_correction(proba, normalizer)
proba = self._empty_leaf_correction(proba)

return proba

else:
raise NotImplementedError(
"Multi-target honest trees not yet \
implemented"
)
all_proba = []

for k in range(self.n_outputs_):
proba_k = proba[:, k, : self._tree_n_classes_[k]]
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba_k /= normalizer
proba_k = self._empty_leaf_correction(proba_k, k)
all_proba.append(proba_k)

return all_proba

def predict(self, X, check_input=True):
"""Predict class for X.
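With the tree and forest changes combined, multi-output classification works end to end. A minimal usage sketch (the import path and the list-of-arrays return shape are assumptions based on the diff and scikit-learn's multi-output convention):

```python
import numpy as np
from sktree import HonestForestClassifier  # assumed import path

rng = np.random.default_rng(12345)
X = rng.normal(0, 1, (100, 2))
# two output columns: a binary target and a three-class target
y = np.stack([rng.integers(0, 2, 100), rng.integers(0, 3, 100)]).T

clf = HonestForestClassifier(n_estimators=10, random_state=0).fit(X, y)
proba = clf.predict_proba(X)
# expected: a list of two arrays with shapes (100, 2) and (100, 3),
# one posterior matrix per output
```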
3 changes: 0 additions & 3 deletions sktree/tree/tests/test_honest_tree.py
@@ -106,7 +106,4 @@ def test_impute_classes():

@parametrize_with_checks([HonestTreeClassifier(random_state=0)])
def test_sklearn_compatible_estimator(estimator, check):
# TODO: remove when we implement Regressor classes
# if TREE_ESTIMATORS[estimator].__name__ in TREE_CLASSIFIERS:
# pytest.skip()
check(estimator)