From 070161a6c534b6503f3ba00ff584390516a761b3 Mon Sep 17 00:00:00 2001
From: Haoyin Xu
Date: Fri, 31 May 2024 12:34:40 -0400
Subject: [PATCH] DOC optimize tutorial & change function name

---
 doc/api.rst                                          |  8 ++++----
 examples/treeple/treeple_tutorial_1_1a_SA98.py       |  4 ++--
 examples/treeple/treeple_tutorial_1_1b_MI.py         |  4 ++--
 examples/treeple/treeple_tutorial_1_1c_pAUC.py       |  4 ++--
 examples/treeple/treeple_tutorial_1_1d_HD.py         |  4 ++--
 examples/treeple/treeple_tutorial_1_2_pvalue.py      |  8 +++++++-
 .../treeple_tutorial_2_1a_SA98_multiview.py          |  4 ++--
 examples/treeple/treeple_tutorial_2_1b_CMI.py        |  6 +++---
 .../treeple_tutorial_2_2_pvalue_multiview.py         |  9 +++++++--
 sktree/stats/__init__.py                             |  8 ++++----
 sktree/stats/forestht.py                             | 14 +++++++-------
 11 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/doc/api.rst b/doc/api.rst
index f4ef89aed..dd5ba611c 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -103,7 +103,7 @@ The trees that comprise those forests are also available as standalone classes.
 
 Outlier Detection
 -----------------
-Isolation forests are a model implemented in scikit-learn, which is an ensemble of 
+Isolation forests are a model implemented in scikit-learn, which is an ensemble of
 extremely randomized axis-aligned decision tree models. Extended isolation forests
 replaces the base tree model with an oblique tree, which allows a more flexible model
 for detecting outliers.
@@ -151,10 +151,10 @@ tree models.
    PermutationForestRegressor
    build_coleman_forest
    build_permutation_forest
-   build_hyppo_oob_forest
-   build_hyppo_cv_forest
+   build_oob_forest
+   build_cv_forest
    PermutationHonestForestClassifier
- 
+
 Datasets
 ------------------------------
 We provide some convenience functions for simulating datasets beyond
diff --git a/examples/treeple/treeple_tutorial_1_1a_SA98.py b/examples/treeple/treeple_tutorial_1_1a_SA98.py
index f8b4bdeca..6d53adde3 100644
--- a/examples/treeple/treeple_tutorial_1_1a_SA98.py
+++ b/examples/treeple/treeple_tutorial_1_1a_SA98.py
@@ -11,7 +11,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -76,7 +76,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, X, y)
+_, observe_proba = build_oob_forest(est, X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
diff --git a/examples/treeple/treeple_tutorial_1_1b_MI.py b/examples/treeple/treeple_tutorial_1_1b_MI.py
index 3379f9e90..36ba00091 100644
--- a/examples/treeple/treeple_tutorial_1_1b_MI.py
+++ b/examples/treeple/treeple_tutorial_1_1b_MI.py
@@ -11,7 +11,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -77,7 +77,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, X, y)
+_, observe_proba = build_oob_forest(est, X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
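The renamed helper keeps the exact call pattern these tutorials already use: fit an
honest forest, collect per-tree out-of-bag posteriors, then average them. A minimal
end-to-end sketch, where the ``make_trunk_classification`` and
``HonestForestClassifier`` arguments are illustrative placeholders rather than the
tutorials' exact settings::

    import numpy as np

    from sktree.datasets import make_trunk_classification
    from sktree.ensemble import HonestForestClassifier
    from sktree.stats import build_oob_forest

    # simulate a simple binary classification task (illustrative parameters)
    X, y = make_trunk_classification(n_samples=1000, n_dim=10, n_informative=1, seed=0)

    # an honest forest with bootstrap sampling, so out-of-bag samples exist
    est = HonestForestClassifier(n_estimators=100, bootstrap=True, random_state=0)

    # fit the model and obtain the per-tree out-of-bag posteriors
    est, observe_proba = build_oob_forest(est, X, y)

    # average over trees; entries a tree never scored out-of-bag are NaN,
    # hence the NaN-aware mean
    observe_proba = np.nanmean(observe_proba, axis=0)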
diff --git a/examples/treeple/treeple_tutorial_1_1c_pAUC.py b/examples/treeple/treeple_tutorial_1_1c_pAUC.py
index b2391c7b0..708ce19ae 100644
--- a/examples/treeple/treeple_tutorial_1_1c_pAUC.py
+++ b/examples/treeple/treeple_tutorial_1_1c_pAUC.py
@@ -11,7 +11,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -78,7 +78,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, X, y)
+_, observe_proba = build_oob_forest(est, X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
diff --git a/examples/treeple/treeple_tutorial_1_1d_HD.py b/examples/treeple/treeple_tutorial_1_1d_HD.py
index a24411e89..03f2ff24b 100644
--- a/examples/treeple/treeple_tutorial_1_1d_HD.py
+++ b/examples/treeple/treeple_tutorial_1_1d_HD.py
@@ -10,7 +10,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -75,7 +75,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, X, y)
+_, observe_proba = build_oob_forest(est, X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
diff --git a/examples/treeple/treeple_tutorial_1_2_pvalue.py b/examples/treeple/treeple_tutorial_1_2_pvalue.py
index b744a13c5..bed25654a 100644
--- a/examples/treeple/treeple_tutorial_1_2_pvalue.py
+++ b/examples/treeple/treeple_tutorial_1_2_pvalue.py
@@ -43,8 +43,14 @@
 #
 # .. math:: I(X; Y) = H(Y) - H(Y\mid X)
 #
+# Under the null hypothesis :math:`H_0`, the conditional entropy ``H(Y | X)``
+# is equal to the class entropy ``H(Y)``, so the *MI* becomes zero. Thus, if
+# the *MI* is significantly larger than zero, we can reject the null hypothesis
+# :math:`H_0`.
+#
 # With a binary class simulation as an example, this tutorial will show
-# how to use ``treeple`` to use the statistic and the p-value.
+# how to use ``treeple`` to calculate the statistic and test the
+# hypothesis with data.
 
 # %%
 # Create a simulation with two gaussians
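Since the added text defines *MI* as :math:`H(Y) - H(Y\mid X)`, a plug-in estimate
can be computed directly from the tree-averaged forest posteriors shown above. A
sketch of that computation, where ``mutual_information`` is a hypothetical helper
written for illustration, not a function of ``sktree.stats``::

    import numpy as np

    def mutual_information(y, posteriors, eps=1e-12):
        """Plug-in estimate of I(X; Y) = H(Y) - H(Y | X), in nats."""
        # H(Y): entropy of the empirical class frequencies
        _, counts = np.unique(y, return_counts=True)
        priors = counts / counts.sum()
        h_y = -np.sum(priors * np.log(priors))

        # H(Y | X): mean entropy of the per-sample posteriors P(Y | X = x_i),
        # clipped away from zero before taking logs
        p = np.clip(posteriors, eps, 1.0)
        h_y_given_x = np.mean(-np.sum(p * np.log(p), axis=1))

        return h_y - h_y_given_x

    # e.g., with the forest posteriors averaged over trees:
    # mi = mutual_information(y, observe_proba)

Under :math:`H_0` this estimate should be near zero; values well above zero are
evidence against the null, which is what the permutation-based p-value quantifies.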
diff --git a/examples/treeple/treeple_tutorial_2_1a_SA98_multiview.py b/examples/treeple/treeple_tutorial_2_1a_SA98_multiview.py
index 9e011d9bc..589945db7 100644
--- a/examples/treeple/treeple_tutorial_2_1a_SA98_multiview.py
+++ b/examples/treeple/treeple_tutorial_2_1a_SA98_multiview.py
@@ -12,7 +12,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 from sktree.tree import MultiViewDecisionTreeClassifier
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -95,7 +95,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, Z_X, y)
+_, observe_proba = build_oob_forest(est, Z_X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
diff --git a/examples/treeple/treeple_tutorial_2_1b_CMI.py b/examples/treeple/treeple_tutorial_2_1b_CMI.py
index 05d32053c..bf5487cf2 100644
--- a/examples/treeple/treeple_tutorial_2_1b_CMI.py
+++ b/examples/treeple/treeple_tutorial_2_1b_CMI.py
@@ -12,7 +12,7 @@
 from sktree.datasets import make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
-from sktree.stats import build_hyppo_oob_forest
+from sktree.stats import build_oob_forest
 from sktree.tree import MultiViewDecisionTreeClassifier
 
 sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
 PALETTE = sns.color_palette("Set1")
@@ -95,7 +95,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, Z_X, y)
+_, observe_proba = build_oob_forest(est, Z_X, y)
 
 # generate forest posteriors for the two classes
 observe_proba = np.nanmean(observe_proba, axis=0)
@@ -129,7 +129,7 @@
 )
 
 # fit the model and obtain the tree posteriors
-_, single_proba = build_hyppo_oob_forest(est, Z, y)
+_, single_proba = build_oob_forest(est, Z, y)
 
 # generate forest posteriors for the two classes
 single_proba = np.nanmean(single_proba, axis=0)
diff --git a/examples/treeple/treeple_tutorial_2_2_pvalue_multiview.py b/examples/treeple/treeple_tutorial_2_2_pvalue_multiview.py
index 1d3c16112..8106f86fb 100644
--- a/examples/treeple/treeple_tutorial_2_2_pvalue_multiview.py
+++ b/examples/treeple/treeple_tutorial_2_2_pvalue_multiview.py
@@ -41,9 +41,14 @@
 #
 # Conditional mutual information (*CMI*) measures the dependence of *Y* on
 # *X* conditioned on *Z*. It can be calculated as the difference between
-# the joint MI (``I([X, Z]; Y)``) and the MI on Z (``I(Y; Z)``):
+# the joint *MI* (``I(Y; [X, Z])``) and the *MI* between *Y* and *Z* (``I(Y; Z)``):
 #
-# .. math:: I(X; Y | Z) = I([X, Z]; Y) - I(Y; Z)
+# .. math:: I(Y; X \mid Z) = I(Y; [X, Z]) - I(Y; Z)
+#
+# Under the null hypothesis :math:`H_0`, the joint *MI* ``I(Y; [X, Z])``
+# equals the *MI* between *Y* and *Z* (``I(Y; Z)``), so the *CMI* becomes
+# zero. Thus, if the *CMI* is significantly larger than zero, we can reject
+# the null hypothesis :math:`H_0`.
 #
 # With a multiview binary class simulation as an example, this tutorial
 # will show how to use ``treeple`` to calculate the statistic and test the
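The same plug-in idea extends to the *CMI* statistic used in the multiview
tutorials: estimate the joint *MI* from a forest trained on the stacked views
``[X, Z]`` and subtract the *MI* from a forest trained on ``Z`` alone. A sketch
reusing the hypothetical ``mutual_information`` helper above, where
``observe_proba`` and ``single_proba`` are the tree-averaged posteriors of the
two forests, as built in the CMI tutorial::

    def conditional_mutual_information(y, joint_proba, single_proba):
        """Plug-in estimate of I(Y; X | Z) = I(Y; [X, Z]) - I(Y; Z)."""
        return mutual_information(y, joint_proba) - mutual_information(y, single_proba)

    # e.g., with posteriors from the two forests:
    # _, observe_proba = build_oob_forest(est, Z_X, y)   # joint views [X, Z]
    # _, single_proba = build_oob_forest(est, Z, y)      # view Z alone
    # cmi = conditional_mutual_information(
    #     y, np.nanmean(observe_proba, axis=0), np.nanmean(single_proba, axis=0)
    # )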
diff --git a/sktree/stats/__init__.py b/sktree/stats/__init__.py
index 39d09234b..188694c0a 100644
--- a/sktree/stats/__init__.py
+++ b/sktree/stats/__init__.py
@@ -2,8 +2,8 @@
     FeatureImportanceForestClassifier,
     FeatureImportanceForestRegressor,
     build_coleman_forest,
-    build_hyppo_cv_forest,
-    build_hyppo_oob_forest,
+    build_cv_forest,
+    build_oob_forest,
     build_permutation_forest,
 )
 from .monte_carlo import PermutationTest
@@ -16,8 +16,8 @@
     "PermutationForestClassifier",
     "PermutationForestRegressor",
     "PermutationTest",
-    "build_hyppo_cv_forest",
-    "build_hyppo_oob_forest",
+    "build_cv_forest",
+    "build_oob_forest",
     "build_coleman_forest",
     "build_permutation_forest",
     "PermutationHonestForestClassifier",
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 6e925e79e..08b11ba1a 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -1294,7 +1294,7 @@ def build_coleman_forest(
     metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]
 
     # build two sets of forests
-    est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose)
+    est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose)
 
     X_null = np.copy(X)
     y_null = np.copy(y)
@@ -1307,7 +1307,7 @@
         rng.shuffle(temp_col)
         X_null[:, covariate_index] = temp_col
 
-    perm_est, perm_forest_proba = build_hyppo_oob_forest(perm_est, X_null, y_null, verbose=verbose)
+    perm_est, perm_forest_proba = build_oob_forest(perm_est, X_null, y_null, verbose=verbose)
 
     # get the number of jobs
     n_jobs = est.n_jobs
@@ -1433,7 +1433,7 @@ def build_permutation_forest(
     )
 
     # train the original forest on unpermuted data
-    est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose)
+    est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose)
 
     y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
     observe_test_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
@@ -1452,7 +1452,7 @@
         perm_est = clone(perm_est)
         perm_est.set_params(random_state=rng.integers(0, np.iinfo(np.int32).max))
 
-        perm_est, perm_forest_proba = build_hyppo_oob_forest(
+        perm_est, perm_forest_proba = build_oob_forest(
             perm_est, X_perm, y, verbose=verbose, covariate_index=covariate_index
         )
 
@@ -1474,7 +1474,7 @@
     return forest_result
 
 
-def build_hyppo_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
+def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
     """Build a hypothesis testing forest using oob samples.
 
     Parameters
@@ -1532,7 +1532,7 @@
     return est, all_proba
 
 
-def build_hyppo_cv_forest(
+def build_cv_forest(
     est,
     X,
     y,
@@ -1541,7 +1541,7 @@
     verbose=False,
     seed=None,
 ):
-    """Build a hypothesis testing forest using oob samples.
+    """Build a hypothesis testing forest using cross-validation.
 
     Parameters
     ----------
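For downstream users, the rename is a drop-in change: the signatures are untouched,
so only the imports and call sites need updating. A migration sketch::

    # before this patch:
    # from sktree.stats import build_hyppo_cv_forest, build_hyppo_oob_forest

    # after this patch:
    from sktree.stats import build_cv_forest, build_oob_forest

    # the call pattern is unchanged, e.g. for the out-of-bag builder:
    # est, all_proba = build_oob_forest(est, X, y)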