neurodata · adam2392 · Jun 22, 2023 · Jun 14, 2023 · Jun 14, 2023 · Jun 14, 2023
diff --git a/docs/whats_new/v0.1.rst b/docs/whats_new/v0.1.rst
@@ -36,6 +36,7 @@ Changelog
 - |Feature| A general-kernel MORF is now implemented where users can pass in a kernel library, by `Adam Li`_ (:pr:`70`)
 - |Feature| Implementation of ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeRegressor, ObliqueRandomForestRegressor, PatchObliqueRandomForestRegressor, by `SUKI-O`_ (:pr:`72`)
 - |Feature| Implementation of HonestTreeClassifier, HonestForestClassifier, by `Sambit Panda`_, `Adam Li`_, `Ronan Perry`_ and `Haoyin Xu`_ (:pr:`57`)
+- |Feature| Add multi-output support to HonestTreeClassifier and HonestForestClassifier, by `Ronan Perry`_, `Haoyin Xu`_ and `Adam Li`_ (:pr:`86`)
 
 Code and Documentation Contributors
 -----------------------------------

diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork
diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py
@@ -400,9 +400,7 @@ def fit(self, X, y, sample_weight=None):
             Fitted tree estimator.
         """
         super().fit(X, y, sample_weight)
-        X, y = check_X_y(X, y, multi_output=False)
-        classes_k, y_encoded = np.unique(y, return_inverse=True)
-        self.empirical_prior_ = np.bincount(y_encoded, minlength=classes_k.shape[0]) / len(y)
+        X, y = check_X_y(X, y, multi_output=True)
 
         # Compute honest decision function
         self.honest_decision_function_ = self._predict_proba(
@@ -440,7 +438,9 @@ def _predict_proba(self, X, indices=None, impute_missing=None):
         n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
 
         # avoid storing the output of every tree estimator by summing them here
-        posteriors = np.zeros((X.shape[0], self.n_classes_), dtype=np.float64)
+        posteriors = [
+            np.zeros((X.shape[0], j), dtype=np.float64) for j in np.atleast_1d(self.n_classes_)
+        ]
         lock = threading.Lock()
 
         if indices is None:
@@ -451,14 +451,23 @@ def _predict_proba(self, X, indices=None, impute_missing=None):
         )
 
         # Normalize to unit length, due to prior weighting
-        zero_mask = posteriors.sum(1) == 0
+        posteriors = np.array(posteriors)
+        zero_mask = posteriors.sum(2) == 0
         posteriors[~zero_mask] /= posteriors[~zero_mask].sum(1, keepdims=True)
+
         if impute_missing is None:
-            posteriors[zero_mask] = self.empirical_prior_
+            pass
         else:
             posteriors[zero_mask] = impute_missing
 
-        return posteriors
+        # multi-output: return a list with one (n_samples, n_classes) array per output
+        if self.n_outputs_ > 1:
+            posteriors = [post for post in posteriors]
+
+        if len(posteriors) == 1:
+            return posteriors[0]
+        else:
+            return posteriors
 
     @property
     def structure_indices_(self):
@@ -485,15 +494,11 @@ def _accumulate_prediction(tree, X, out, lock, indices=None):
 
     if indices is None:
         indices = np.arange(X.shape[0])
-    proba = tree.tree_.predict(X[indices])
-    proba = proba[:, : tree._tree_n_classes_]
-    normalizer = proba.sum(axis=1)[:, np.newaxis]
-    normalizer[normalizer == 0.0] = 1.0
-    proba /= normalizer
-
-    if tree._tree_n_classes_ != tree.n_classes_:
-        proba = tree._impute_missing_classes(proba)
-    proba = tree._empty_leaf_correction(proba, normalizer)
+    proba = tree.predict_proba(X[indices], check_input=False)
 
     with lock:
-        out[indices] += proba
+        if len(out) == 1:
+            out[0][indices] += proba
+        else:
+            for i in range(len(out)):
+                out[i][indices] += proba[i]
diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 from sklearn import datasets
-from sklearn.metrics import accuracy_score
+from sklearn.metrics import accuracy_score, mean_squared_error
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
@@ -59,6 +59,37 @@ def test_iris(criterion, max_features, estimator):
     )
 
 
+@pytest.mark.parametrize("criterion", ["gini", "entropy"])
+@pytest.mark.parametrize("max_features", [None, 2])
+@pytest.mark.parametrize(
+    "estimator",
+    [
+        DecisionTreeClassifier(),
+        ObliqueDecisionTreeClassifier(),
+        PatchObliqueDecisionTreeClassifier(),
+    ],
+)
+def test_iris_multi(criterion, max_features, estimator):
+    """Fit iris with two outputs (target + synthetic labels) and check the prediction MSE."""
+    clf = HonestForestClassifier(
+        criterion=criterion,
+        random_state=0,
+        max_features=max_features,
+        n_estimators=10,
+        tree_estimator=estimator,
+    )
+
+    second_y = np.concatenate([(np.ones(10) * 3), (np.ones(20) * 4), (np.ones(120) * 5)])
+
+    X = iris.data
+    y = np.stack((iris.target, second_y)).T
+    clf.fit(X, y)
+    score = mean_squared_error(y, clf.predict(X))
+    assert score < 0.5, "Failed with {0}, criterion = {1} and score = {2}".format(
+        "HForest", criterion, score
+    )
+
+
 def test_impute_classes():
     np.random.seed(0)
     X = np.random.normal(0, 1, (101, 2))
@@ -138,17 +169,12 @@ def test_honest_decision_function(honest_fraction, val):
     [HonestForestClassifier(n_estimators=10, honest_fraction=0.5, random_state=0)]
 )
 def test_sklearn_compatible_estimator(estimator, check):
-    # 1. multi-output is not supported
-    # 2. check_class_weight_classifiers is not supported since it requires sample weight
+    # 1. check_class_weight_classifiers is not supported since it requires sample weight
     # XXX: can include this "generalization" in the future if it's useful
     #  zero sample weight is not "really supported" in honest subsample trees since sample weight
     #  for fitting the tree's splits
     if check.func.__name__ in [
         "check_class_weight_classifiers",
-        "check_classifiers_multilabel_output_format_decision_function",
-        "check_classifiers_multilabel_output_format_predict_proba",
-        "check_classifiers_multilabel_output_format_predict",
-        "check_classifiers_multilabel_representation_invariance",
     ]:
         pytest.skip()
     check(estimator)
diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py
@@ -360,12 +360,6 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         )
         self._inherit_estimator_attributes()
 
-        if self.n_outputs_ > 1:
-            raise NotImplementedError(
-                "Multi-target honest trees not yet \
-                implemented"
-            )
-
         # update the number of classes, unsplit
         if y.ndim == 1:
             # reshape is necessary to preserve the data contiguity against vs
@@ -423,18 +417,29 @@ def _inherit_estimator_attributes(self):
         self.n_outputs_ = self.estimator_.n_outputs_
         self.tree_ = self.estimator_.tree_
 
-    def _empty_leaf_correction(self, proba, normalizer):
+    def _empty_leaf_correction(self, proba, pos=0):
         """Leaves with empty posteriors are assigned values"""
         zero_mask = proba.sum(axis=1) == 0.0
-        if self.honest_prior == "empirical":
-            proba[zero_mask] = self.empirical_prior_
-        elif self.honest_prior == "uniform":
-            proba[zero_mask] = 1 / self.n_classes_
-        elif self.honest_prior == "ignore":
-            proba[zero_mask] = np.nan
-        else:
-            raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
 
+        # Multi-output: ``pos`` selects which output's prior / class count to use
+        if self.n_outputs_ > 1:
+            if self.honest_prior == "empirical":
+                proba[zero_mask] = self.empirical_prior_[pos]
+            elif self.honest_prior == "uniform":
+                proba[zero_mask] = 1 / self.n_classes_[pos]
+            elif self.honest_prior == "ignore":
+                proba[zero_mask] = np.nan
+            else:
+                raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
+        else:
+            if self.honest_prior == "empirical":
+                proba[zero_mask] = self.empirical_prior_
+            elif self.honest_prior == "uniform":
+                proba[zero_mask] = 1 / self.n_classes_
+            elif self.honest_prior == "ignore":
+                proba[zero_mask] = np.nan
+            else:
+                raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
         return proba
 
     def _impute_missing_classes(self, proba):
@@ -481,15 +486,24 @@ class in a leaf.
             proba /= normalizer
             if self._tree_n_classes_ != self.n_classes_:
                 proba = self._impute_missing_classes(proba)
-            proba = self._empty_leaf_correction(proba, normalizer)
+            proba = self._empty_leaf_correction(proba)
 
             return proba
 
         else:
-            raise NotImplementedError(
-                "Multi-target honest trees not yet \
-                implemented"
-            )
+            all_proba = []
+
+            for k in range(self.n_outputs_):
+                proba_k = proba[:, k, : self._tree_n_classes_[k]]
+                normalizer = proba_k.sum(axis=1)[:, np.newaxis]
+                normalizer[normalizer == 0.0] = 1.0
+                proba_k /= normalizer
+                if self._tree_n_classes_[k] != self.n_classes_[k]:
+                    proba_k = self._impute_missing_classes(proba_k)
+                proba_k = self._empty_leaf_correction(proba_k, k)
+                all_proba.append(proba_k)
+
+            return all_proba
 
     def predict(self, X, check_input=True):
         """Predict class for X.

diff --git a/sktree/tree/tests/test_honest_tree.py b/sktree/tree/tests/test_honest_tree.py
@@ -106,7 +106,4 @@ def test_impute_classes():
 
 @parametrize_with_checks([HonestTreeClassifier(random_state=0)])
 def test_sklearn_compatible_estimator(estimator, check):
-    # TODO: remove when we implement Regressor classes
-    # if TREE_ESTIMATORS[estimator].__name__ in TREE_CLASSIFIERS:
-    #     pytest.skip()
     check(estimator)