Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add multi-output support to honest trees #86

Merged
merged 20 commits
Jun 22, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/whats_new/v0.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ Changelog
- |Feature| A general-kernel MORF is now implemented where users can pass in a kernel library, by `Adam Li`_ (:pr:`70`)
- |Feature| Implementation of ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeRegressor, ObliqueRandomForestRegressor, PatchObliqueRandomForestRegressor, by `SUKI-O`_ (:pr:`72`)
- |Feature| Implementation of HonestTreeClassifier, HonestForestClassifier, by `Sambit Panda`_, `Adam Li`_, `Ronan Perry`_ and `Haoyin Xu`_ (:pr:`57`)
- |Feature| Add multi-output support to HonestTreeClassifier, HonestForestClassifier, by `Ronan Perry`_, `Haoyin Xu`_ and `Adam Li`_ (:pr:`86`)

Code and Documentation Contributors
-----------------------------------
Expand Down
2 changes: 1 addition & 1 deletion sktree/_lib/sklearn_fork
Submodule sklearn_fork updated 156 files
39 changes: 22 additions & 17 deletions sktree/ensemble/_honest_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,9 +400,7 @@ def fit(self, X, y, sample_weight=None):
Fitted tree estimator.
"""
super().fit(X, y, sample_weight)
X, y = check_X_y(X, y, multi_output=False)
classes_k, y_encoded = np.unique(y, return_inverse=True)
self.empirical_prior_ = np.bincount(y_encoded, minlength=classes_k.shape[0]) / len(y)
X, y = check_X_y(X, y, multi_output=True)

# Compute honest decision function
self.honest_decision_function_ = self._predict_proba(
Expand Down Expand Up @@ -440,7 +438,9 @@ def _predict_proba(self, X, indices=None, impute_missing=None):
n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

# avoid storing the output of every tree estimator by summing them here
posteriors = np.zeros((X.shape[0], self.n_classes_), dtype=np.float64)
posteriors = [
np.zeros((X.shape[0], j), dtype=np.float64) for j in np.atleast_1d(self.n_classes_)
]
lock = threading.Lock()

if indices is None:
Expand All @@ -451,14 +451,23 @@ def _predict_proba(self, X, indices=None, impute_missing=None):
)

# Normalize to unit length, due to prior weighting
zero_mask = posteriors.sum(1) == 0
posteriors = np.array(posteriors)
zero_mask = posteriors.sum(2) == 0
posteriors[~zero_mask] /= posteriors[~zero_mask].sum(1, keepdims=True)

if impute_missing is None:
posteriors[zero_mask] = self.empirical_prior_
pass
else:
posteriors[zero_mask] = impute_missing

return posteriors
# preserve shape of multi-outputs
if self.n_outputs_ > 1:
posteriors = [post for post in posteriors]

if len(posteriors) == 1:
return posteriors[0]
else:
return posteriors

@property
def structure_indices_(self):
Expand All @@ -485,15 +494,11 @@ def _accumulate_prediction(tree, X, out, lock, indices=None):

if indices is None:
indices = np.arange(X.shape[0])
proba = tree.tree_.predict(X[indices])
proba = proba[:, : tree._tree_n_classes_]
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba /= normalizer

if tree._tree_n_classes_ != tree.n_classes_:
proba = tree._impute_missing_classes(proba)
proba = tree._empty_leaf_correction(proba, normalizer)
proba = tree.predict_proba(X[indices], check_input=False)

with lock:
out[indices] += proba
if len(out) == 1:
out[0][indices] += proba
else:
for i in range(len(out)):
out[i][indices] += proba[i]
40 changes: 33 additions & 7 deletions sktree/tests/test_honest_forest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pytest
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.estimator_checks import parametrize_with_checks

Expand Down Expand Up @@ -59,6 +59,37 @@ def test_iris(criterion, max_features, estimator):
)


@pytest.mark.parametrize("criterion", ["gini", "entropy"])
@pytest.mark.parametrize("max_features", [None, 2])
@pytest.mark.parametrize(
"estimator",
[
DecisionTreeClassifier(),
ObliqueDecisionTreeClassifier(),
PatchObliqueDecisionTreeClassifier(),
],
)
def test_iris_multi(criterion, max_features, estimator):
# Check consistency on dataset iris.
clf = HonestForestClassifier(
criterion=criterion,
random_state=0,
max_features=max_features,
n_estimators=10,
tree_estimator=estimator,
)

second_y = np.concatenate([(np.ones(10) * 3), (np.ones(20) * 4), (np.ones(120) * 5)])

X = iris.data
y = np.stack((iris.target, second_y)).T
clf.fit(X, y)
score = mean_squared_error(clf.predict(X), y)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just use https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

and assert some non-trivial performance?

Is it possible to interpret MSE of 0.5-1.0?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It appears that accuracy_score doesn't support multi-output. It seems that the default way they measure such predictions is through sklearn.multioutput.MultiOutputClassifier.

assert score < 0.5 and score < 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)


def test_impute_classes():
np.random.seed(0)
X = np.random.normal(0, 1, (101, 2))
Expand Down Expand Up @@ -138,17 +169,12 @@ def test_honest_decision_function(honest_fraction, val):
[HonestForestClassifier(n_estimators=10, honest_fraction=0.5, random_state=0)]
)
def test_sklearn_compatible_estimator(estimator, check):
# 1. multi-output is not supported
# 2. check_class_weight_classifiers is not supported since it requires sample weight
# 1. check_class_weight_classifiers is not supported since it requires sample weight
# XXX: can include this "generalization" in the future if it's useful
# zero sample weight is not "really supported" in honest subsample trees since sample weight
# is used for fitting the tree's splits
if check.func.__name__ in [
"check_class_weight_classifiers",
"check_classifiers_multilabel_output_format_decision_function",
"check_classifiers_multilabel_output_format_predict_proba",
"check_classifiers_multilabel_output_format_predict",
"check_classifiers_multilabel_representation_invariance",
]:
pytest.skip()
check(estimator)
54 changes: 34 additions & 20 deletions sktree/tree/_honest_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,12 +360,6 @@ def fit(self, X, y, sample_weight=None, check_input=True):
)
self._inherit_estimator_attributes()

if self.n_outputs_ > 1:
raise NotImplementedError(
"Multi-target honest trees not yet \
implemented"
)

# update the number of classes, unsplit
if y.ndim == 1:
# reshape is necessary to preserve the data contiguity against vs
Expand Down Expand Up @@ -423,18 +417,29 @@ def _inherit_estimator_attributes(self):
self.n_outputs_ = self.estimator_.n_outputs_
self.tree_ = self.estimator_.tree_

def _empty_leaf_correction(self, proba, normalizer):
def _empty_leaf_correction(self, proba, pos=0):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you just add a short docstring to describe what's going on? I'm reading these lines and having trouble figuring out what pos does actually :/

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pos indicates the class dimension of y, so that posterior corrections only work on that dimension in multi-output cases.

"""Leaves with empty posteriors are assigned values"""
zero_mask = proba.sum(axis=1) == 0.0
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")

# For multi-output cases
if self.n_outputs_ > 1:
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_[pos]
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_[pos]
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
else:
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
return proba

def _impute_missing_classes(self, proba):
Expand Down Expand Up @@ -481,15 +486,24 @@ class in a leaf.
proba /= normalizer
if self._tree_n_classes_ != self.n_classes_:
proba = self._impute_missing_classes(proba)
proba = self._empty_leaf_correction(proba, normalizer)
proba = self._empty_leaf_correction(proba)

return proba

else:
raise NotImplementedError(
"Multi-target honest trees not yet \
implemented"
)
all_proba = []

for k in range(self.n_outputs_):
proba_k = proba[:, k, : self._tree_n_classes_[k]]
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba_k /= normalizer
if self._tree_n_classes_[k] != self.n_classes_[k]:
proba_k = self._impute_missing_classes(proba_k)
proba_k = self._empty_leaf_correction(proba_k, k)
all_proba.append(proba_k)

return all_proba

def predict(self, X, check_input=True):
"""Predict class for X.
Expand Down
3 changes: 0 additions & 3 deletions sktree/tree/tests/test_honest_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,4 @@ def test_impute_classes():

@parametrize_with_checks([HonestTreeClassifier(random_state=0)])
def test_sklearn_compatible_estimator(estimator, check):
# TODO: remove when we implement Regressor classes
# if TREE_ESTIMATORS[estimator].__name__ in TREE_CLASSIFIERS:
# pytest.skip()
check(estimator)