Add multi-output support to honest trees (#86)
* ENH multi-output support for honest trees/forests

---------

Signed-off-by: Haoyin Xu <haoyinxu@gmail.com>
Co-authored-by: Ronan Perry <13107341+rflperry@users.noreply.github.com>
Co-authored-by: Adam Li <adam2392@gmail.com>
3 people committed Jun 22, 2023
1 parent 7eb96e6 commit d506519
Showing 6 changed files with 114 additions and 78 deletions.
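For orientation, here is a minimal usage sketch of what this commit enables, adapted from the tests added below. The second target column is purely illustrative, and the top-level import path is an assumption:

```python
import numpy as np
from sklearn import datasets
from sktree import HonestForestClassifier  # assumed import path

iris = datasets.load_iris()
X = iris.data

# Build a two-output target by stacking the iris labels with a second,
# purely illustrative label vector (as in test_iris_multi below).
second_y = np.concatenate([np.ones(50) * 3, np.ones(50) * 4, np.ones(50) * 5])
y = np.stack((iris.target, second_y)).T  # shape (150, 2)

clf = HonestForestClassifier(n_estimators=10, random_state=0)
clf.fit(X, y)

preds = clf.predict(X)          # shape (n_samples, n_outputs)
probas = clf.predict_proba(X)   # list of per-output (n_samples, n_classes_k) arrays
```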
1 change: 1 addition & 0 deletions docs/whats_new/v0.1.rst
@@ -37,6 +37,7 @@ Changelog
- |Feature| Implementation of ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeRegressor, ObliqueRandomForestRegressor, PatchObliqueRandomForestRegressor, by `SUKI-O`_ (:pr:`72`)
- |Feature| Implementation of HonestTreeClassifier, HonestForestClassifier, by `Sambit Panda`_, `Adam Li`_, `Ronan Perry`_ and `Haoyin Xu`_ (:pr:`57`)
- |Feature| Implementation of (conditional) mutual information estimation via unsupervised tree models and added NearestNeighborsMetaEstimator by `Adam Li`_ (:pr:`83`)
- |Feature| Add multi-output support to HonestTreeClassifier, HonestForestClassifier, by `Ronan Perry`_, `Haoyin Xu`_ and `Adam Li`_ (:pr:`86`)


Code and Documentation Contributors
Binary file added examples/overlapping_gaussians.png
39 changes: 22 additions & 17 deletions sktree/ensemble/_honest_forest.py
@@ -400,9 +400,7 @@ def fit(self, X, y, sample_weight=None):
Fitted tree estimator.
"""
super().fit(X, y, sample_weight)
X, y = check_X_y(X, y, multi_output=False)
classes_k, y_encoded = np.unique(y, return_inverse=True)
self.empirical_prior_ = np.bincount(y_encoded, minlength=classes_k.shape[0]) / len(y)
X, y = check_X_y(X, y, multi_output=True)

# Compute honest decision function
self.honest_decision_function_ = self._predict_proba(
@@ -440,7 +438,9 @@ def _predict_proba(self, X, indices=None, impute_missing=None):
n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

# avoid storing the output of every tree estimator by summing them here
posteriors = np.zeros((X.shape[0], self.n_classes_), dtype=np.float64)
posteriors = [
np.zeros((X.shape[0], j), dtype=np.float64) for j in np.atleast_1d(self.n_classes_)
]
lock = threading.Lock()

if indices is None:
@@ -451,14 +451,23 @@
)

# Normalize to unit length, due to prior weighting
zero_mask = posteriors.sum(1) == 0
posteriors = np.array(posteriors)
zero_mask = posteriors.sum(2) == 0
posteriors[~zero_mask] /= posteriors[~zero_mask].sum(1, keepdims=True)

if impute_missing is None:
posteriors[zero_mask] = self.empirical_prior_
pass
else:
posteriors[zero_mask] = impute_missing

return posteriors
# preserve shape of multi-outputs
if self.n_outputs_ > 1:
posteriors = [post for post in posteriors]

if len(posteriors) == 1:
return posteriors[0]
else:
return posteriors

@property
def structure_indices_(self):
@@ -485,15 +494,11 @@ def _accumulate_prediction(tree, X, out, lock, indices=None):

if indices is None:
indices = np.arange(X.shape[0])
proba = tree.tree_.predict(X[indices])
proba = proba[:, : tree._tree_n_classes_]
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba /= normalizer

if tree._tree_n_classes_ != tree.n_classes_:
proba = tree._impute_missing_classes(proba)
proba = tree._empty_leaf_correction(proba, normalizer)
proba = tree.predict_proba(X[indices], check_input=False)

with lock:
out[indices] += proba
if len(out) == 1:
out[0][indices] += proba
else:
for i in range(len(out)):
out[i][indices] += proba[i]
75 changes: 51 additions & 24 deletions sktree/tests/test_honest_forest.py
@@ -1,7 +1,7 @@
import numpy as np
import pytest
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.estimator_checks import parametrize_with_checks

@@ -30,6 +30,7 @@ def test_toy_accuracy():

@pytest.mark.parametrize("criterion", ["gini", "entropy"])
@pytest.mark.parametrize("max_features", [None, 2])
@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore", "error"])
@pytest.mark.parametrize(
"estimator",
[
@@ -38,17 +39,21 @@ def test_toy_accuracy():
DecisionTreeClassifier(),
ObliqueDecisionTreeClassifier(),
PatchObliqueDecisionTreeClassifier(),
],
)
def test_iris(criterion, max_features, estimator):
def test_iris(criterion, max_features, honest_prior, estimator):
# Check consistency on dataset iris.
clf = HonestForestClassifier(
criterion=criterion,
random_state=0,
max_features=max_features,
n_estimators=10,
honest_prior=honest_prior,
tree_estimator=estimator,
)
clf.fit(iris.data, iris.target)
score = accuracy_score(clf.predict(iris.data), iris.target)
try:
clf.fit(iris.data, iris.target)
score = accuracy_score(clf.predict(iris.data), iris.target)
except ValueError:
return
assert score > 0.5 and score < 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)
@@ -59,26 +64,55 @@ def test_iris(criterion, max_features, estimator):
)


def test_impute_classes():
np.random.seed(0)
X = np.random.normal(0, 1, (101, 2))
y = [0] * 50 + [1] * 50 + [2]
clf = HonestForestClassifier(honest_fraction=0.02, random_state=0)
clf = clf.fit(X, y)
@pytest.mark.parametrize("criterion", ["gini", "entropy"])
@pytest.mark.parametrize("max_features", [None, 2])
@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore", "error"])
@pytest.mark.parametrize(
"estimator",
[
DecisionTreeClassifier(),
ObliqueDecisionTreeClassifier(),
PatchObliqueDecisionTreeClassifier(),
],
)
def test_iris_multi(criterion, max_features, honest_prior, estimator):
# Check consistency on dataset iris.
clf = HonestForestClassifier(
criterion=criterion,
random_state=0,
max_features=max_features,
n_estimators=10,
honest_prior=honest_prior,
tree_estimator=estimator,
)

y_proba = clf.predict_proba(X)
second_y = np.concatenate([(np.ones(50) * 3), (np.ones(50) * 4), (np.ones(50) * 5)])

assert y_proba.shape[1] == 3
X = iris.data
y = np.stack((iris.target, second_y[perm])).T
try:
clf.fit(X, y)
score = r2_score(clf.predict(X), y)
except ValueError:
return
if honest_prior == "ignore":
assert (
score > 0.6 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
else:
assert (
score > 0.9 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)


def test_max_samples():
max_samples_list = [8, 0.5, None]
depths = []
X = np.random.normal(0, 1, (100, 2))
X = rng.normal(0, 1, (100, 2))
X[:50] *= -1
y = [0, 1] * 50
for ms in max_samples_list:
uf = HonestForestClassifier(n_estimators=2, max_samples=ms, bootstrap=True)
uf = HonestForestClassifier(n_estimators=2, random_state=0, max_samples=ms, bootstrap=True)
uf = uf.fit(X, y)
depths.append(uf.estimators_[0].get_depth())

@@ -94,8 +128,7 @@ def test_max_samples():
],
)
def test_impute_posteriors(honest_prior, val):
np.random.seed(0)
X = np.random.normal(0, 1, (100, 2))
X = rng.normal(0, 1, (100, 2))
y = [0] * 75 + [1] * 25
clf = HonestForestClassifier(
honest_fraction=0.02, random_state=0, honest_prior=honest_prior, n_estimators=2
@@ -121,8 +154,7 @@ def test_impute_posteriors(honest_prior, val):
],
)
def test_honest_decision_function(honest_fraction, val):
np.random.seed(0)
X = np.random.normal(0, 1, (100, 2))
X = rng.normal(0, 1, (100, 2))
y = [0] * 75 + [1] * 25
clf = HonestForestClassifier(honest_fraction=honest_fraction, random_state=0, n_estimators=2)
clf = clf.fit(X, y)
@@ -138,17 +170,12 @@ def test_honest_decision_function(honest_fraction, val):
[HonestForestClassifier(n_estimators=10, honest_fraction=0.5, random_state=0)]
)
def test_sklearn_compatible_estimator(estimator, check):
# 1. multi-output is not supported
# 2. check_class_weight_classifiers is not supported since it requires sample weight
# 1. check_class_weight_classifiers is not supported since it requires sample weight
# XXX: can include this "generalization" in the future if it's useful
# zero sample weight is not "really supported" in honest subsample trees since sample weight
# for fitting the tree's splits
if check.func.__name__ in [
"check_class_weight_classifiers",
"check_classifiers_multilabel_output_format_decision_function",
"check_classifiers_multilabel_output_format_predict_proba",
"check_classifiers_multilabel_output_format_predict",
"check_classifiers_multilabel_representation_invariance",
]:
pytest.skip()
check(estimator)
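As a standalone sketch, the same compatibility suite can be driven directly through scikit-learn's `check_estimator`; note that `check_class_weight_classifiers`, skipped in the test above, would still fail here since it is not skippable this way:

```python
from sklearn.utils.estimator_checks import check_estimator
from sktree import HonestForestClassifier  # assumed import path

# Runs the sklearn estimator check suite; with multi-output support, the
# multilabel output-format checks no longer need to be skipped.
check_estimator(HonestForestClassifier(n_estimators=10, honest_fraction=0.5, random_state=0))
```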
74 changes: 40 additions & 34 deletions sktree/tree/_honest_tree.py
@@ -360,12 +360,6 @@ def fit(self, X, y, sample_weight=None, check_input=True):
)
self._inherit_estimator_attributes()

if self.n_outputs_ > 1:
raise NotImplementedError(
"Multi-target honest trees not yet \
implemented"
)

# update the number of classes, unsplit
if y.ndim == 1:
# reshape is necessary to preserve the data contiguity against vs
@@ -419,8 +413,8 @@ def _set_leaf_nodes(self, leaf_ids, y):
classes are ordered by their index in the tree_.value array.
"""
self.tree_.value[:, :, :] = 0
for leaf_id, yval in zip(leaf_ids, y[self.honest_indices_, 0]):
self.tree_.value[leaf_id][0, yval] += 1
for leaf_id, yval in zip(leaf_ids, y[self.honest_indices_, :]):
self.tree_.value[leaf_id][:, yval] += 1

def _inherit_estimator_attributes(self):
"""Initialize necessary attributes from the provided tree estimator"""
@@ -431,29 +425,36 @@ def _inherit_estimator_attributes(self):
self.n_outputs_ = self.estimator_.n_outputs_
self.tree_ = self.estimator_.tree_

def _empty_leaf_correction(self, proba, normalizer):
"""Leaves with empty posteriors are assigned values"""
def _empty_leaf_correction(self, proba, pos=0):
"""Leaves with empty posteriors are assigned values.
The posteriors are corrected according to the honest prior.
In multi-output cases, the posterior corrections only correspond
to the respective y dimension, indicated by the position param pos.
"""
zero_mask = proba.sum(axis=1) == 0.0
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")

# For multi-output cases
if self.n_outputs_ > 1:
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_[pos]
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_[pos]
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
else:
if self.honest_prior == "empirical":
proba[zero_mask] = self.empirical_prior_
elif self.honest_prior == "uniform":
proba[zero_mask] = 1 / self.n_classes_
elif self.honest_prior == "ignore":
proba[zero_mask] = np.nan
else:
raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
return proba

def _impute_missing_classes(self, proba):
"""Due to splitting, provide proba outputs for some classes"""
new_proba = np.zeros((proba.shape[0], self.n_classes_))
for i, old_class in enumerate(self._tree_classes_):
j = np.where(self.classes_ == old_class)[0][0]
new_proba[:, j] = proba[:, i]

return new_proba

def predict_proba(self, X, check_input=True):
"""Predict class probabilities of the input samples X.
@@ -487,17 +488,22 @@ class in a leaf.
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba /= normalizer
if self._tree_n_classes_ != self.n_classes_:
proba = self._impute_missing_classes(proba)
proba = self._empty_leaf_correction(proba, normalizer)
proba = self._empty_leaf_correction(proba)

return proba

else:
raise NotImplementedError(
"Multi-target honest trees not yet \
implemented"
)
all_proba = []

for k in range(self.n_outputs_):
proba_k = proba[:, k, : self._tree_n_classes_[k]]
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba_k /= normalizer
proba_k = self._empty_leaf_correction(proba_k, k)
all_proba.append(proba_k)

return all_proba

def predict(self, X, check_input=True):
"""Predict class for X.
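The per-output empty-leaf correction in `_empty_leaf_correction` above reduces to the following toy numpy sketch (standalone, with made-up numbers): any posterior row that sums to zero, i.e. a leaf that received no honest samples, is filled from the configured prior.

```python
import numpy as np

proba = np.array([[0.7, 0.3],
                  [0.0, 0.0]])            # second row: leaf saw no honest samples
empirical_prior = np.array([0.75, 0.25])  # class frequencies from the honest set
n_classes = 2
honest_prior = "empirical"                # or "uniform" / "ignore"

zero_mask = proba.sum(axis=1) == 0.0
if honest_prior == "empirical":
    proba[zero_mask] = empirical_prior
elif honest_prior == "uniform":
    proba[zero_mask] = 1 / n_classes
elif honest_prior == "ignore":
    proba[zero_mask] = np.nan
else:
    raise ValueError(f"honest_prior {honest_prior} not a valid input.")
```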
3 changes: 0 additions & 3 deletions sktree/tree/tests/test_honest_tree.py
@@ -106,7 +106,4 @@ def test_impute_classes():

@parametrize_with_checks([HonestTreeClassifier(random_state=0)])
def test_sklearn_compatible_estimator(estimator, check):
# TODO: remove when we implement Regressor classes
# if TREE_ESTIMATORS[estimator].__name__ in TREE_CLASSIFIERS:
# pytest.skip()
check(estimator)
