neurodata · adam2392 · Jul 15, 2024 · Jul 3, 2024 · Jul 3, 2024 · Jul 3, 2024
diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst
@@ -40,4 +40,4 @@ Thanks to everyone who has contributed to the maintenance and improvement of
 the project since version inception, including:
 
 * `Adam Li`_
-
+* `Sambit Panda`_
diff --git a/doc/whats_new/v0.9.rst b/doc/whats_new/v0.9.rst
@@ -13,13 +13,15 @@ Version 0.9
 Changelog
 ---------
 
-- 
+- |Fix| Fixed a bug in the :class:`sktree.HonestForestClassifier` where posteriors
+    estimated on an empty leaf with the ``ignore`` prior would result in ``np.nan``
+    values for all trees on that sample.
+    By `Haoyin Xu`_ (:pr:`#291`)
 
 Code and Documentation Contributors
 -----------------------------------
 
 Thanks to everyone who has contributed to the maintenance and improvement of
 the project since version inception, including:
 
-* `Adam Li`_
-
+* `Haoyin Xu`_
diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py
@@ -259,7 +259,7 @@ class HonestForestClassifier(ForestClassifier, ForestClassifierMixin):
         - If int, then draw `max_samples` samples.
         - If float, then draw `max_samples * X.shape[0]` samples.
 
-    honest_prior : {"ignore", "uniform", "empirical"}, default="empirical"
+    honest_prior : {"ignore", "uniform", "empirical"}, default="ignore"
         Method for dealing with empty leaves during evaluation of a test
         sample. If "ignore", the tree is ignored. If "uniform", the prior tree
         posterior is 1/(number of classes). If "empirical", the prior tree
@@ -444,7 +444,7 @@ def __init__(
         class_weight=None,
         ccp_alpha=0.0,
         max_samples=None,
-        honest_prior="empirical",
+        honest_prior="ignore",
         honest_fraction=0.5,
         tree_estimator=None,
         stratify=False,
@@ -672,10 +672,7 @@ def _predict_proba(self, X, indices=None, impute_missing=None):
         zero_mask = posteriors.sum(2) == 0
         posteriors[~zero_mask] /= posteriors[~zero_mask].sum(1, keepdims=True)
 
-        if impute_missing is None:
-            pass
-        else:
-            posteriors[zero_mask] = impute_missing
+        posteriors[zero_mask] = impute_missing
 
         # preserve shape of multi-outputs
         if self.n_outputs_ > 1:
@@ -823,7 +820,7 @@ def _accumulate_prediction(predict, X, out, lock, indices=None):
 
     with lock:
         if len(out) == 1:
-            out[0][indices] += proba
+            out[0][indices] = np.nansum([out[0][indices], proba], axis=0)
         else:
             for i in range(len(out)):
-                out[i][indices] += proba[i]
+                out[i][indices] = np.nansum([out[i][indices], proba[i]], axis=0)
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
@@ -157,12 +157,15 @@ def test_small_dataset_dependent(seed):
         n_repeats=1000,
         metric="mi",
         return_posteriors=False,
+        seed=seed,
     )
     assert ~np.isnan(result.pvalue)
     assert ~np.isnan(result.observe_test_stat)
     assert result.pvalue <= 0.05
 
-    result = build_coleman_forest(clf, perm_clf, X, y, metric="mi", return_posteriors=False)
+    result = build_coleman_forest(
+        clf, perm_clf, X, y, metric="mi", return_posteriors=False, seed=seed
+    )
     assert result.pvalue <= 0.05
 
 

diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py
@@ -263,6 +263,27 @@ def test_impute_posteriors(honest_prior, val):
         ), f"Failed with {honest_prior}, prior {clf.estimators_[0].empirical_prior_}"
 
 
+@pytest.mark.parametrize(
+    "honest_prior, val",
+    [
+        ("ignore", np.nan),  # `val` is accepted by the test but never read in its body
+    ],
+)
+def test_ignore_posteriors(honest_prior, val):
+    X = rng.normal(0, 1, (100, 2))
+    y = [0] * 75 + [1] * 25  # imbalanced labels: 75 of class 0 followed by 25 of class 1
+    clf = HonestForestClassifier(
+        honest_fraction=0.5, random_state=0, honest_prior=honest_prior, n_estimators=100
+    )
+    clf = clf.fit(X, y)
+
+    y_proba = clf.predict_proba(X)  # predict on the same X the forest was fit on
+
+    assert (
+        len(np.where(np.isnan(y_proba[:, 0]))[0]) < 10  # fewer than 10 rows may be NaN in column 0
+    ), f"Failed with {honest_prior}, prior {clf.estimators_[0].empirical_prior_}"
+
+
 @pytest.mark.parametrize(
     "honest_fraction, val",
     [