From 9a71e86cd14c1d55470a346260ec8437e3f0fe88 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 3 Jul 2024 12:21:09 -0400 Subject: [PATCH 01/14] FIX ignore nan values when summing posteriors --- sktree/ensemble/_honest_forest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index 2ea11ea93..685966181 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -823,7 +823,7 @@ def _accumulate_prediction(predict, X, out, lock, indices=None): with lock: if len(out) == 1: - out[0][indices] += proba + out[0][indices] = np.nansum([out[0][indices], proba], axis=0) else: for i in range(len(out)): - out[i][indices] += proba[i] + out[i][indices] = np.nansum([out[i][indices], proba[i]], axis=0) From a4e5305b2f5540de2e73521256cfed6866519322 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 3 Jul 2024 12:45:47 -0400 Subject: [PATCH 02/14] FIX correct imputation for missing posteriors --- sktree/ensemble/_honest_forest.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index 685966181..d3b67d0a8 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -672,10 +672,7 @@ def _predict_proba(self, X, indices=None, impute_missing=None): zero_mask = posteriors.sum(2) == 0 posteriors[~zero_mask] /= posteriors[~zero_mask].sum(1, keepdims=True) - if impute_missing is None: - pass - else: - posteriors[zero_mask] = impute_missing + posteriors[zero_mask] = impute_missing # preserve shape of multi-outputs if self.n_outputs_ > 1: From b6232861e9582dbc36b0e0deab22162c9943fa4f Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 3 Jul 2024 14:27:01 -0400 Subject: [PATCH 03/14] TST add test about 100 tree ignore settings --- sktree/tests/test_honest_forest.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py index 3610475d6..46aae41c5 100644 --- a/sktree/tests/test_honest_forest.py +++ b/sktree/tests/test_honest_forest.py @@ -263,6 +263,27 @@ def test_impute_posteriors(honest_prior, val): ), f"Failed with {honest_prior}, prior {clf.estimators_[0].empirical_prior_}" +@pytest.mark.parametrize( + "honest_prior, val", + [ + ("ignore", np.nan), + ], +) +def test_ignore_posteriors(honest_prior, val): + X = rng.normal(0, 1, (100, 2)) + y = [0] * 75 + [1] * 25 + clf = HonestForestClassifier( + honest_fraction=0.5, random_state=0, honest_prior=honest_prior, n_estimators=100 + ) + clf = clf.fit(X, y) + + y_proba = clf.predict_proba(X) + + assert ( + len(np.where(np.isnan(y_proba[:, 0]))[0]) < 10 + ), f"Failed with {honest_prior}, prior {clf.estimators_[0].empirical_prior_}" + + @pytest.mark.parametrize( "honest_fraction, val", [ From 7462ad7f63de8973621a7fc89933548c29d5b741 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 3 Jul 2024 14:44:15 -0400 Subject: [PATCH 04/14] DOC update whats_new --- doc/whats_new/v0.8.rst | 2 +- doc/whats_new/v0.9.rst | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst index a0949489d..e705cf292 100644 --- a/doc/whats_new/v0.8.rst +++ b/doc/whats_new/v0.8.rst @@ -40,4 +40,4 @@ Thanks to everyone who has contributed to the maintenance and improvement of the project since version inception, including: * `Adam Li`_ - +* `Sambit Panda`_ diff --git a/doc/whats_new/v0.9.rst b/doc/whats_new/v0.9.rst index 9c5ffb3b2..d7dbee3b2 100644 --- a/doc/whats_new/v0.9.rst +++ b/doc/whats_new/v0.9.rst @@ -13,7 +13,10 @@ Version 0.9 Changelog --------- -- +- |Fix| Fixed a bug in the :class:`sktree.HonestForestClassifier` where posteriors + estimated on empty leaf with ``ignore`` prior would result in ``np.nan`` + values for all trees on that sample. + By `Haoyin Xu`_ (:pr:`#291`) Code and Documentation Contributors ----------------------------------- @@ -21,5 +24,4 @@ Code and Documentation Contributors Thanks to everyone who has contributed to the maintenance and improvement of the project since version inception, including: -* `Adam Li`_ - +* `Haoyin Xu`_ From d48bad7265691fdee222378677e72374af7c5126 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 3 Jul 2024 15:13:44 -0400 Subject: [PATCH 05/14] FIX add seed to test & set ignore as default prior --- sktree/ensemble/_honest_forest.py | 4 ++-- sktree/stats/tests/test_forestht.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index d3b67d0a8..3d662c9a0 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -259,7 +259,7 @@ class HonestForestClassifier(ForestClassifier, ForestClassifierMixin): - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - honest_prior : {"ignore", "uniform", "empirical"}, default="empirical" + honest_prior : {"ignore", "uniform", "empirical"}, default="ignore" Method for dealing with empty leaves during evaluation of a test sample. If "ignore", the tree is ignored. If "uniform", the prior tree posterior is 1/(number of classes). If "empirical", the prior tree @@ -444,7 +444,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, - honest_prior="empirical", + honest_prior="ignore", honest_fraction=0.5, tree_estimator=None, stratify=False, diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index f4b397ffc..77912d55d 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -157,12 +157,15 @@ def test_small_dataset_dependent(seed): n_repeats=1000, metric="mi", return_posteriors=False, + seed=seed, ) assert ~np.isnan(result.pvalue) assert ~np.isnan(result.observe_test_stat) assert result.pvalue <= 0.05 - result = build_coleman_forest(clf, perm_clf, X, y, metric="mi", return_posteriors=False) + result = build_coleman_forest( + clf, perm_clf, X, y, metric="mi", return_posteriors=False, seed=seed + ) assert result.pvalue <= 0.05 From f5eff85376aac26e2432165c9478f45fa3c1c1c6 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Fri, 5 Jul 2024 16:21:43 -0400 Subject: [PATCH 06/14] Update sktree/tests/test_honest_forest.py Co-authored-by: Adam Li --- sktree/tests/test_honest_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py index 46aae41c5..bc87588f2 100644 --- a/sktree/tests/test_honest_forest.py +++ b/sktree/tests/test_honest_forest.py @@ -269,7 +269,7 @@ def test_impute_posteriors(honest_prior, val): ("ignore", np.nan), ], ) -def test_ignore_posteriors(honest_prior, val): +def test_honestforest_predict_proba_with_honest_prior(honest_prior, val): X = rng.normal(0, 1, (100, 2)) y = [0] * 75 + [1] * 25 clf = HonestForestClassifier( From de57d4d5e9cf0fb8c018105826e2c44b8f60fe01 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Fri, 5 Jul 2024 16:32:48 -0400 Subject: [PATCH 07/14] FIX correct unit tests --- sktree/stats/tests/test_forestht.py | 1 + sktree/tests/test_extensions.py | 5 ++++- sktree/tests/test_honest_forest.py | 12 ++++-------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 77912d55d..9b009e646 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -139,6 +139,7 @@ def test_small_dataset_dependent(seed): honest_fraction=0.5, bootstrap=True, max_samples=1.6, + honest_prior='empirical', ) perm_clf = PermutationHonestForestClassifier( n_estimators=n_estimators, diff --git a/sktree/tests/test_extensions.py b/sktree/tests/test_extensions.py index c0b74e8c2..017e433a2 100644 --- a/sktree/tests/test_extensions.py +++ b/sktree/tests/test_extensions.py @@ -32,7 +32,10 @@ def test_predict_proba_per_tree(Forest, n_classes): ) # Call the method being tested - est = Forest(n_estimators=10, bootstrap=True, random_state=0) + if Forest == HonestForestClassifier: + est = Forest(n_estimators=10, bootstrap=True, random_state=0, honest_prior="empirical") + else: + est = Forest(n_estimators=10, bootstrap=True, random_state=0) est.fit(X, y) proba_per_tree = est.predict_proba_per_tree(X) diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py index bc87588f2..11082ba37 100644 --- a/sktree/tests/test_honest_forest.py +++ b/sktree/tests/test_honest_forest.py @@ -263,15 +263,10 @@ def test_impute_posteriors(honest_prior, val): ), f"Failed with {honest_prior}, prior {clf.estimators_[0].empirical_prior_}" -@pytest.mark.parametrize( - "honest_prior, val", - [ - ("ignore", np.nan), - ], -) -def test_honestforest_predict_proba_with_honest_prior(honest_prior, val): +def test_honestforest_predict_proba_with_honest_prior(): X = rng.normal(0, 1, (100, 2)) y = [0] * 75 + [1] * 25 + honest_prior = "ignore" clf = HonestForestClassifier( honest_fraction=0.5, random_state=0, honest_prior=honest_prior, n_estimators=100 ) @@ -279,8 +274,9 @@ def test_honestforest_predict_proba_with_honest_prior(honest_prior, val): y_proba = clf.predict_proba(X) + # With enough trees no nan values should exist assert ( - len(np.where(np.isnan(y_proba[:, 0]))[0]) < 10 + len(np.where(np.isnan(y_proba[:, 0]))[0]) == 0 ), f"Failed with {honest_prior}, prior {clf.estimators_[0].empirical_prior_}" From 4ba31a4938dd88ac7075bf1cdad752cb87aa47af Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 5 Jul 2024 20:33:06 +0000 Subject: [PATCH 08/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sktree/stats/tests/test_forestht.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 9b009e646..9121b5088 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -139,7 +139,7 @@ def test_small_dataset_dependent(seed): honest_fraction=0.5, bootstrap=True, max_samples=1.6, - honest_prior='empirical', + honest_prior="empirical", ) perm_clf = PermutationHonestForestClassifier( n_estimators=n_estimators, From 9126a9daae0ca4577d22b040898e68885778d68c Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Fri, 5 Jul 2024 18:41:36 -0400 Subject: [PATCH 09/14] FIX increase sample number to be consistent --- sktree/stats/tests/test_forestht.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 9121b5088..cfb28a796 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -119,11 +119,10 @@ def test_small_dataset_independent(seed): @flaky(max_runs=3) @pytest.mark.parametrize("seed", [10, 0]) def test_small_dataset_dependent(seed): - n_samples = 50 + n_samples = 100 n_features = 5 rng = np.random.default_rng(seed) - X = rng.uniform(size=(n_samples, n_features)) X = rng.uniform(size=(n_samples // 2, n_features)) X2 = X + 3 X = np.vstack([X, X2]) @@ -139,7 +138,6 @@ def test_small_dataset_dependent(seed): honest_fraction=0.5, bootstrap=True, max_samples=1.6, - honest_prior="empirical", ) perm_clf = PermutationHonestForestClassifier( n_estimators=n_estimators, From 3e2cda8191648dff05e17f9fa64465c74dcb6b2a Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Tue, 9 Jul 2024 16:22:42 -0400 Subject: [PATCH 10/14] FIX add condition on impute_missing --- sktree/ensemble/_honest_forest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index 3d662c9a0..f9ba3ec5d 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -672,7 +672,8 @@ def _predict_proba(self, X, indices=None, impute_missing=None): zero_mask = posteriors.sum(2) == 0 posteriors[~zero_mask] /= posteriors[~zero_mask].sum(1, keepdims=True) - posteriors[zero_mask] = impute_missing + if impute_missing is not None: + posteriors[zero_mask] = impute_missing # preserve shape of multi-outputs if self.n_outputs_ > 1: From 19ed6f39c764587a0fe65d9ee1b6686a2e8aca62 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Tue, 9 Jul 2024 16:39:51 -0400 Subject: [PATCH 11/14] Revert "FIX add condition on impute_missing" This reverts commit 3e2cda8191648dff05e17f9fa64465c74dcb6b2a. --- treeple/ensemble/_honest_forest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/treeple/ensemble/_honest_forest.py b/treeple/ensemble/_honest_forest.py index 04783221c..cba859a04 100644 --- a/treeple/ensemble/_honest_forest.py +++ b/treeple/ensemble/_honest_forest.py @@ -672,8 +672,7 @@ def _predict_proba(self, X, indices=None, impute_missing=None): zero_mask = posteriors.sum(2) == 0 posteriors[~zero_mask] /= posteriors[~zero_mask].sum(1, keepdims=True) - if impute_missing is not None: - posteriors[zero_mask] = impute_missing + posteriors[zero_mask] = impute_missing # preserve shape of multi-outputs if self.n_outputs_ > 1: From 06ba512a1ef4ea00ac9c31e0cccb2b43a11c7df9 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Tue, 9 Jul 2024 16:42:02 -0400 Subject: [PATCH 12/14] FIX update impute_missing default --- treeple/ensemble/_honest_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/treeple/ensemble/_honest_forest.py b/treeple/ensemble/_honest_forest.py index cba859a04..96c010625 100644 --- a/treeple/ensemble/_honest_forest.py +++ b/treeple/ensemble/_honest_forest.py @@ -648,7 +648,7 @@ def predict_proba(self, X): """ return self._predict_proba(X) - def _predict_proba(self, X, indices=None, impute_missing=None): + def _predict_proba(self, X, indices=None, impute_missing=np.nan): """predict_proba helper class""" check_is_fitted(self) X = self._validate_X_predict(X) From b3eb6d34a0b81365d9f624202833b6e653634439 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 10 Jul 2024 10:03:11 -0400 Subject: [PATCH 13/14] DOC correct package name --- doc/whats_new/v0.9.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.9.rst b/doc/whats_new/v0.9.rst index 3220f92d0..b217cf8d0 100644 --- a/doc/whats_new/v0.9.rst +++ b/doc/whats_new/v0.9.rst @@ -10,7 +10,7 @@ Version 0.9 **In Development** -This release include a rename of the package to from ``scikit-learn`` to ``treeple`` +This release include a rename of the package to from ``scikit-tree`` to ``treeple`` The users can replace the previous usage as follows: ``import sktree`` to ``import treeple`` ``from sktree import tree`` to ``from treeple import tree`` @@ -21,7 +21,7 @@ Changelog --------- - |API| Rename the package to ``treeple``. By `SUKI-O`_ (:pr:`#292`) -- |Fix| Fixed a bug in the :class:`sktree.HonestForestClassifier` where posteriors +- |Fix| Fixed a bug in the :class:`treeple.HonestForestClassifier` where posteriors estimated on empty leaf with ``ignore`` prior would result in ``np.nan`` values for all trees on that sample. By `Haoyin Xu`_ (:pr:`#291`) From 09aa2e936524620eac465993b86150d8c92a0949 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 15 Jul 2024 12:39:44 -0400 Subject: [PATCH 14/14] Update doc/whats_new/v0.9.rst --- doc/whats_new/v0.9.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.9.rst b/doc/whats_new/v0.9.rst index b217cf8d0..a3688111b 100644 --- a/doc/whats_new/v0.9.rst +++ b/doc/whats_new/v0.9.rst @@ -21,7 +21,7 @@ Changelog --------- - |API| Rename the package to ``treeple``. By `SUKI-O`_ (:pr:`#292`) -- |Fix| Fixed a bug in the :class:`treeple.HonestForestClassifier` where posteriors +- |Fix| Fixed a bug in the predict_proba function of the :class:`treeple.HonestForestClassifier` where posteriors estimated on empty leaf with ``ignore`` prior would result in ``np.nan`` values for all trees on that sample. By `Haoyin Xu`_ (:pr:`#291`)