From 070161a6c534b6503f3ba00ff584390516a761b3 Mon Sep 17 00:00:00 2001
From: Haoyin Xu
Date: Fri, 31 May 2024 12:34:40 -0400
Subject: [PATCH] DOC optimize tutorial & change function name

---
 doc/api.rst                                          |  8 ++++----
 examples/treeple/treeple_tutorial_1_1a_SA98.py       |  4 ++--
 examples/treeple/treeple_tutorial_1_1b_MI.py         |  4 ++--
 examples/treeple/treeple_tutorial_1_1c_pAUC.py       |  4 ++--
 examples/treeple/treeple_tutorial_1_1d_HD.py         |  4 ++--
 examples/treeple/treeple_tutorial_1_2_pvalue.py      |  8 +++++++-
 .../treeple_tutorial_2_1a_SA98_multiview.py          |  4 ++--
 examples/treeple/treeple_tutorial_2_1b_CMI.py        |  6 +++---
 .../treeple_tutorial_2_2_pvalue_multiview.py         |  9 +++++++--
 sktree/stats/__init__.py                             |  8 ++++----
 sktree/stats/forestht.py                             | 14 +++++++-------
 11 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/doc/api.rst b/doc/api.rst
index f4ef89aed..dd5ba611c 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -103,7 +103,7 @@ The trees that comprise those forests are also available as standalone classes.
 
 Outlier Detection
 -----------------
-Isolation forests are a model implemented in scikit-learn, which is an ensemble of 
+Isolation forests are a model implemented in scikit-learn, which is an ensemble of
 extremely randomized axis-aligned decision tree models. Extended isolation forests
 replaces the base tree model with an oblique tree, which allows a more flexible model
 for detecting outliers.
@@ -151,10 +151,10 @@ tree models.
    PermutationForestRegressor
    build_coleman_forest
    build_permutation_forest
-   build_hyppo_oob_forest
-   build_hyppo_cv_forest
+   build_oob_forest
+   build_cv_forest
    PermutationHonestForestClassifier
- 
+
 Datasets
 ------------------------------
 We provide some convenience functions for simulating datasets beyond
diff --git a/examples/treeple/treeple_tutorial_1_1a_SA98.py b/examples/treeple/treeple_tutorial_1_1a_SA98.py
index f8b4bdeca..6d53adde3 100644
--- a/examples/treeple/treeple_tutorial_1_1a_SA98.py
+++ b/examples/treeple/treeple_tutorial_1_1a_SA98.py
@@ -11,7 +11,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -76,7 +76,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, X, y)
+_, observe_proba = build_oob_forest(est, X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
diff --git a/examples/treeple/treeple_tutorial_1_1b_MI.py b/examples/treeple/treeple_tutorial_1_1b_MI.py
index 3379f9e90..36ba00091 100644
--- a/examples/treeple/treeple_tutorial_1_1b_MI.py
+++ b/examples/treeple/treeple_tutorial_1_1b_MI.py
@@ -11,7 +11,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -77,7 +77,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, X, y)
+_, observe_proba = build_oob_forest(est, X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
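The renamed helper keeps the exact call pattern these tutorials already use: fit an
honest forest, collect per-tree out-of-bag posteriors, then average them. A minimal
end-to-end sketch, where the ``make_trunk_classification`` and
``HonestForestClassifier`` arguments are illustrative placeholders rather than the
tutorials' exact settings::

    import numpy as np

    from sktree.datasets import make_trunk_classification
    from sktree.ensemble import HonestForestClassifier
    from sktree.stats import build_oob_forest

    # simulate a simple binary classification task (illustrative parameters)
    X, y = make_trunk_classification(n_samples=1000, n_dim=10, n_informative=1, seed=0)

    # an honest forest with bootstrap sampling, so out-of-bag samples exist
    est = HonestForestClassifier(n_estimators=100, bootstrap=True, random_state=0)

    # fit the model and obtain the per-tree out-of-bag posteriors
    est, observe_proba = build_oob_forest(est, X, y)

    # average over trees; entries a tree never scored out-of-bag are NaN,
    # hence the NaN-aware mean
    observe_proba = np.nanmean(observe_proba, axis=0)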
diff --git a/examples/treeple/treeple_tutorial_1_1c_pAUC.py b/examples/treeple/treeple_tutorial_1_1c_pAUC.py
index b2391c7b0..708ce19ae 100644
--- a/examples/treeple/treeple_tutorial_1_1c_pAUC.py
+++ b/examples/treeple/treeple_tutorial_1_1c_pAUC.py
@@ -11,7 +11,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -78,7 +78,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, X, y)
+_, observe_proba = build_oob_forest(est, X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
diff --git a/examples/treeple/treeple_tutorial_1_1d_HD.py b/examples/treeple/treeple_tutorial_1_1d_HD.py
index a24411e89..03f2ff24b 100644
--- a/examples/treeple/treeple_tutorial_1_1d_HD.py
+++ b/examples/treeple/treeple_tutorial_1_1d_HD.py
@@ -10,7 +10,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -75,7 +75,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, X, y)
+_, observe_proba = build_oob_forest(est, X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
diff --git a/examples/treeple/treeple_tutorial_1_2_pvalue.py b/examples/treeple/treeple_tutorial_1_2_pvalue.py
index b744a13c5..bed25654a 100644
--- a/examples/treeple/treeple_tutorial_1_2_pvalue.py
+++ b/examples/treeple/treeple_tutorial_1_2_pvalue.py
@@ -43,8 +43,14 @@
 #
 # .. math:: I(X; Y) = H(Y) - H(Y\mid X)
 #
+# Under the null hypothesis :math:`H_0`, the conditional entropy ``H(Y | X)``
+# is equal to the class entropy ``H(Y)``, so the *MI* becomes zero. Thus, if
+# the *MI* is significantly larger than zero, we can reject the null hypothesis
+# :math:`H_0`.
+#
 # With a binary class simulation as an example, this tutorial will show
-# how to use ``treeple`` to use the statistic and the p-value.
+# how to use ``treeple`` to calculate the statistic and test the
+# hypothesis with data.
 
 # %%
 # Create a simulation with two gaussians
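Since the added text defines *MI* as :math:`H(Y) - H(Y\mid X)`, a plug-in estimate
can be computed directly from the tree-averaged forest posteriors shown above. A
sketch of that computation, where ``mutual_information`` is a hypothetical helper
written for illustration, not a function of ``sktree.stats``::

    import numpy as np

    def mutual_information(y, posteriors, eps=1e-12):
        """Plug-in estimate of I(X; Y) = H(Y) - H(Y | X), in nats."""
        # H(Y): entropy of the empirical class frequencies
        _, counts = np.unique(y, return_counts=True)
        priors = counts / counts.sum()
        h_y = -np.sum(priors * np.log(priors))

        # H(Y | X): mean entropy of the per-sample posteriors P(Y | X = x_i),
        # clipped away from zero before taking logs
        p = np.clip(posteriors, eps, 1.0)
        h_y_given_x = np.mean(-np.sum(p * np.log(p), axis=1))

        return h_y - h_y_given_x

    # e.g., with the forest posteriors averaged over trees:
    # mi = mutual_information(y, observe_proba)

Under :math:`H_0` this estimate should be near zero; values well above zero are
evidence against the null, which is what the permutation-based p-value quantifies.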
diff --git a/examples/treeple/treeple_tutorial_2_1a_SA98_multiview.py b/examples/treeple/treeple_tutorial_2_1a_SA98_multiview.py
index 9e011d9bc..589945db7 100644
--- a/examples/treeple/treeple_tutorial_2_1a_SA98_multiview.py
+++ b/examples/treeple/treeple_tutorial_2_1a_SA98_multiview.py
@@ -12,7 +12,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 from sktree.tree import MultiViewDecisionTreeClassifier
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -95,7 +95,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, Z_X, y)
+_, observe_proba = build_oob_forest(est, Z_X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
diff --git a/examples/treeple/treeple_tutorial_2_1b_CMI.py b/examples/treeple/treeple_tutorial_2_1b_CMI.py
index 05d32053c..bf5487cf2 100644
--- a/examples/treeple/treeple_tutorial_2_1b_CMI.py
+++ b/examples/treeple/treeple_tutorial_2_1b_CMI.py
@@ -12,7 +12,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 from sktree.tree import MultiViewDecisionTreeClassifier
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -95,7 +95,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, Z_X, y)
+_, observe_proba = build_oob_forest(est, Z_X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
@@ -129,7 +129,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, single_proba = build_hyppo_oob_forest(est, Z, y)
+_, single_proba = build_oob_forest(est, Z, y)
 
 # generate forest posteriors for the two classes
 single_proba = np.nanmean(single_proba, axis=0)
diff --git a/examples/treeple/treeple_tutorial_2_2_pvalue_multiview.py b/examples/treeple/treeple_tutorial_2_2_pvalue_multiview.py
index 1d3c16112..8106f86fb 100644
--- a/examples/treeple/treeple_tutorial_2_2_pvalue_multiview.py
+++ b/examples/treeple/treeple_tutorial_2_2_pvalue_multiview.py
@@ -41,9 +41,14 @@
 #
 # Conditional mutual information (*CMI*) measures the dependence of *Y* on
 # *X* conditioned on *Z*. It can be calculated as the difference between
-# the joint MI (``I([X, Z]; Y)``) and the MI on Z (``I(Y; Z)``):
+# the joint *MI* (``I(Y; [X, Z])``) and the *MI* between *Y* and *Z* (``I(Y; Z)``):
 #
-# .. math:: I(X; Y | Z) = I([X, Z]; Y) - I(Y; Z)
+# .. math:: I(Y; X \mid Z) = I(Y; [X, Z]) - I(Y; Z)
+#
+# Under the null hypothesis :math:`H_0`, the joint *MI* ``I(Y; [X, Z])``
+# equals the *MI* between *Y* and *Z* (``I(Y; Z)``), so the *CMI* becomes
+# zero. Thus, if the *CMI* is significantly larger than zero, we can reject
+# the null hypothesis :math:`H_0`.
 #
 # With a multiview binary class simulation as an example, this tutorial
 # will show how to use ``treeple`` to calculate the statistic and test the
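The same plug-in idea extends to the *CMI* statistic used in the multiview
tutorials: estimate the joint *MI* from a forest trained on the stacked views
``[X, Z]`` and subtract the *MI* from a forest trained on ``Z`` alone. A sketch
reusing the hypothetical ``mutual_information`` helper above, where
``observe_proba`` and ``single_proba`` are the tree-averaged posteriors of the
two forests, as built in the CMI tutorial::

    def conditional_mutual_information(y, joint_proba, single_proba):
        """Plug-in estimate of I(Y; X | Z) = I(Y; [X, Z]) - I(Y; Z)."""
        return mutual_information(y, joint_proba) - mutual_information(y, single_proba)

    # e.g., with posteriors from the two forests:
    # _, observe_proba = build_oob_forest(est, Z_X, y)   # joint views [X, Z]
    # _, single_proba = build_oob_forest(est, Z, y)      # view Z alone
    # cmi = conditional_mutual_information(
    #     y, np.nanmean(observe_proba, axis=0), np.nanmean(single_proba, axis=0)
    # )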
diff --git a/sktree/stats/__init__.py b/sktree/stats/__init__.py
index 39d09234b..188694c0a 100644
--- a/sktree/stats/__init__.py
+++ b/sktree/stats/__init__.py
@@ -2,8 +2,8 @@
     FeatureImportanceForestClassifier,
     FeatureImportanceForestRegressor,
     build_coleman_forest,
-    build_hyppo_cv_forest,
-    build_hyppo_oob_forest,
+    build_cv_forest,
+    build_oob_forest,
     build_permutation_forest,
 )
 from .monte_carlo import PermutationTest
@@ -16,8 +16,8 @@
     "PermutationForestClassifier",
     "PermutationForestRegressor",
     "PermutationTest",
-    "build_hyppo_cv_forest",
-    "build_hyppo_oob_forest",
+    "build_cv_forest",
+    "build_oob_forest",
     "build_coleman_forest",
     "build_permutation_forest",
     "PermutationHonestForestClassifier",
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 6e925e79e..08b11ba1a 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -1294,7 +1294,7 @@ def build_coleman_forest(
     metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]
 
     # build two sets of forests
-    est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose)
+    est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose)
 
     X_null = np.copy(X)
     y_null = np.copy(y)
@@ -1307,7 +1307,7 @@
         rng.shuffle(temp_col)
         X_null[:, covariate_index] = temp_col
 
-    perm_est, perm_forest_proba = build_hyppo_oob_forest(perm_est, X_null, y_null, verbose=verbose)
+    perm_est, perm_forest_proba = build_oob_forest(perm_est, X_null, y_null, verbose=verbose)
 
     # get the number of jobs
     n_jobs = est.n_jobs
@@ -1433,7 +1433,7 @@ def build_permutation_forest(
     )
 
     # train the original forest on unpermuted data
-    est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose)
+    est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose)
 
     y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
     observe_test_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
@@ -1452,7 +1452,7 @@
         perm_est = clone(perm_est)
         perm_est.set_params(random_state=rng.integers(0, np.iinfo(np.int32).max))
 
-        perm_est, perm_forest_proba = build_hyppo_oob_forest(
+        perm_est, perm_forest_proba = build_oob_forest(
             perm_est, X_perm, y, verbose=verbose, covariate_index=covariate_index
         )
 
@@ -1474,7 +1474,7 @@
     return forest_result
 
 
-def build_hyppo_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
+def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
     """Build a hypothesis testing forest using oob samples.
 
     Parameters
@@ -1532,7 +1532,7 @@
     return est, all_proba
 
 
-def build_hyppo_cv_forest(
+def build_cv_forest(
     est,
     X,
     y,
@@ -1541,7 +1541,7 @@
     verbose=False,
     seed=None,
 ):
-    """Build a hypothesis testing forest using oob samples.
+    """Build a hypothesis testing forest using cross-validation.
 
     Parameters
     ----------
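For downstream users, the rename is a drop-in change: the signatures are untouched,
so only the imports and call sites need updating. A migration sketch::

    # before this patch:
    # from sktree.stats import build_hyppo_cv_forest, build_hyppo_oob_forest

    # after this patch:
    from sktree.stats import build_cv_forest, build_oob_forest

    # the call pattern is unchanged, e.g. for the out-of-bag builder:
    # est, all_proba = build_oob_forest(est, X, y)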