From a3a002dedbf02c8e262b308870566b7a7d3414a3 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 16 Oct 2023 14:29:47 -0400
Subject: [PATCH 01/26] Add option to permute per forest fraction

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 4d6dc7b77..48c7fb4aa 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -106,6 +106,7 @@ def _parallel_build_trees_and_compute_posteriors(
     posterior_arr[idx, indices_test, :] = y_pred  # posterior


+# TODO: add support for the permute_per_forest_fraction option
 class BaseForestHT(MetaEstimatorMixin):
     observe_samples_: ArrayLike
     observe_posteriors_: ArrayLike

From 5277a5bad8d799f68b5d809f5357b6d8bca97cb6 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 16 Oct 2023 16:03:31 -0400
Subject: [PATCH 02/26] Add sep parallel func for building and predicting

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 68 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 48c7fb4aa..2afa97c67 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -31,6 +31,74 @@
 )


+def _parallel_predict_proba(predict_proba, X, indices_test):
+    """
+    This is a utility function for joblib's Parallel.
+
+    It can't go locally in ForestClassifier or ForestRegressor, because joblib
+    complains that it cannot pickle it when placed there.
+    """
+    # each tree predicts proba with a list of outputs of shape (n_samples, n_classes[i])
+    prediction = predict_proba(X[indices_test, :], check_input=False)
+    return prediction
+
+
+def _parallel_build_trees_with_sepdata(
+    tree: Union[DecisionTreeClassifier, DecisionTreeRegressor],
+    n_trees: int,
+    idx: int,
+    indices_train: ArrayLike,
+    X: ArrayLike,
+    y: ArrayLike,
+    covariate_index,
+    bootstrap: bool,
+    max_samples,
+    sample_weight: ArrayLike = None,
+    class_weight=None,
+    missing_values_in_feature_mask=None,
+    classes=None,
+    random_state=None,
+):
+    """Parallel function to build trees, each on separately defined data.
+
+    This inherently assumes that the caller function defines the training
+    indices for each tree.
+    """
+    rng = np.random.default_rng(random_state)
+    X_train = X[indices_train, :]
+    y_train = y[indices_train, ...]
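+    # NOTE: the fancy indexing above returns copies, so the in-place covariate
+    # permutation below mutates only this tree's X_train, never the caller's X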
+
+    if bootstrap:
+        n_samples_bootstrap = _get_n_samples_bootstrap(
+            n_samples=X_train.shape[0], max_samples=max_samples
+        )
+    else:
+        n_samples_bootstrap = None
+
+    # individual tree permutation of the covariate index
+    if covariate_index is not None:
+        indices = np.arange(X_train.shape[0], dtype=int)
+        # perform permutation of covariates
+        index_arr = rng.choice(indices, size=(X_train.shape[0], 1), replace=False, shuffle=True)
+        perm_X_cov = X_train[index_arr, covariate_index]
+        X_train[:, covariate_index] = perm_X_cov
+
+    tree = _parallel_build_trees(
+        tree,
+        bootstrap,
+        X_train,
+        y_train,
+        sample_weight,
+        idx,
+        n_trees,
+        verbose=0,
+        class_weight=class_weight,
+        n_samples_bootstrap=n_samples_bootstrap,
+        missing_values_in_feature_mask=missing_values_in_feature_mask,
+        classes=classes,
+    )
+    return tree
+
 def _parallel_build_trees_and_compute_posteriors(
     forest: BaseForest,
     idx: int,

From df3a1b154a7e6e423591701c1587238d658d6f3b Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 16 Oct 2023 16:08:08 -0400
Subject: [PATCH 03/26] Finished adding

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 131 ++++++++++++++++++++++++++++-----------
 1 file changed, 95 insertions(+), 36 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 2afa97c67..6439d8dfe 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -779,24 +779,41 @@ def _statistic(
         # both sampling dataset per tree or permuting per tree requires us to bypass the
         # sklearn API to fit each tree individually
         if self.sample_dataset_per_tree or self.permute_per_tree:
-            Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
-                delayed(_parallel_build_trees_and_compute_posteriors)(
-                    estimator,
+            # Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
+            #     delayed(_parallel_build_trees_and_compute_posteriors)(
+            #         estimator,
+            #         idx,
+            #         indices_train,
+            #         indices_test,
+            #         X,
+            #         y,
+            #         covariate_index,
+            #         posterior_arr,
+            #         False,
+            #         self.permute_per_tree,
+            #         self._type_of_target_,
+            #     )
+            #     for idx, (indices_train, indices_test) in enumerate(
+            #         self._get_estimators_indices(sample_separate=True)
+            #     )
+            # )
+
+            trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
+                delayed(_parallel_build_trees_with_sepdata)(
+                    estimator.estimators_[idx],
+                    len(estimator.estimators_),
                     idx,
                     indices_train,
-                    indices_test,
                     X,
                     y,
                     covariate_index,
-                    posterior_arr,
-                    False,
-                    self.permute_per_tree,
-                    self._type_of_target_,
-                )
-                for idx, (indices_train, indices_test) in enumerate(
-                    self._get_estimators_indices(sample_separate=True)
+                    bootstrap=estimator.bootstrap,
+                    max_samples=estimator.max_samples,
+                    random_state=random_states[idx],
                 )
+                for idx, (indices_train, _) in enumerate(self._get_estimators_indices())
             )
+            estimator.estimators_ = trees
         else:
             # fitting a forest will only get one unique train/test split
             indices_train, indices_test = self.train_test_samples_[0]

             X_train, X_test = X[indices_train, :], X[indices_test, :]
             y_train, y_test = y[indices_train, :], y[indices_test, :]

             if covariate_index is not None:
@@ -820,15 +837,24 @@ def _statistic(
             estimator.fit(X_train, y_train)

         # construct posterior array for all trees (n_trees, n_samples_test, n_outputs)
-        for itree, tree in enumerate(estimator.estimators_):
-            posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape(
-                -1, tree.n_outputs_
-            )
+        # for itree, tree in enumerate(estimator.estimators_):
+        #     posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape(
+        #         -1, tree.n_outputs_
+        #     )

         # set variables to compute metric
         samples = indices_test
         y_true_final = y_test

+        # accumulate the predictions across all trees
+        all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
+                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+                )
+        for itree, (proba, est_indices) in enumerate(zip(all_proba, self._get_estimators_indices())):
+            _, indices_test = est_indices
+            posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
+
         # determine if there are any nans in the final posterior array, when
         # averaged over the trees
         samples = _non_nan_samples(posterior_arr)
@@ -1000,24 +1026,41 @@ def _statistic(
         # both sampling dataset per tree or permuting per tree requires us to bypass the
         # sklearn API to fit each tree individually
         if self.sample_dataset_per_tree or self.permute_per_tree:
-            Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
-                delayed(_parallel_build_trees_and_compute_posteriors)(
-                    estimator,
+            # Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
+            #     delayed(_parallel_build_trees_and_compute_posteriors)(
+            #         estimator,
+            #         idx,
+            #         indices_train,
+            #         indices_test,
+            #         X,
+            #         y,
+            #         covariate_index,
+            #         posterior_arr,
+            #         predict_posteriors,
+            #         self.permute_per_tree,
+            #         self._type_of_target_,
+            #     )
+            #     for idx, (indices_train, indices_test) in enumerate(
+            #         self._get_estimators_indices(sample_separate=True)
+            #     )
+            # )
+
+            trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
+                delayed(_parallel_build_trees_with_sepdata)(
+                    estimator.estimators_[idx],
+                    len(estimator.estimators_),
                     idx,
                     indices_train,
-                    indices_test,
                     X,
                     y,
                     covariate_index,
-                    posterior_arr,
-                    predict_posteriors,
-                    self.permute_per_tree,
-                    self._type_of_target_,
-                )
-                for idx, (indices_train, indices_test) in enumerate(
-                    self._get_estimators_indices(sample_separate=True)
+                    bootstrap=estimator.bootstrap,
+                    max_samples=estimator.max_samples,
+                    random_state=random_states[idx],
                 )
+                for idx, (indices_train, _) in enumerate(self._get_estimators_indices())
             )
+            estimator.estimators_ = trees
         else:
             # fitting a forest will only get one unique train/test split
             indices_train, indices_test = self.train_test_samples_[0]

             X_train, X_test = X[indices_train, :], X[indices_test, :]
             y_train = y[indices_train, :]

             if covariate_index is not None:
@@ -1041,19 +1084,35 @@ def _statistic(
             estimator.fit(X_train, y_train)

         # construct posterior array for all trees (n_trees, n_samples_test, n_outputs)
-        for itree, tree in enumerate(estimator.estimators_):
-            if predict_posteriors:
-                # XXX: currently assumes n_outputs_ == 1
-                posterior_arr[itree, indices_test, ...] = tree.predict_proba(X_test).reshape(
-                    -1, tree.n_classes_
-                )
-            else:
-                posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape(
-                    -1, tree.n_outputs_
-                )
+        # for itree, tree in enumerate(estimator.estimators_):
+        #     if predict_posteriors:
+        #         # XXX: currently assumes n_outputs_ == 1
+        #         posterior_arr[itree, indices_test, ...] = tree.predict_proba(X_test).reshape(
+        #             -1, tree.n_classes_
+        #         )
+        #     else:
+        #         posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape(
+        #             -1, tree.n_outputs_
+        #         )

         # set variables to compute metric
         samples = indices_test
+
+        # list of tree outputs. Each tree output is (n_samples, n_outputs), or (n_samples,)
+        if predict_posteriors:
+            all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
+                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict_proba, X, indices_test)
+                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+            )
+        else:
+            all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
+                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+            )
+        for itree, (proba, est_indices) in enumerate(zip(all_proba, self._get_estimators_indices())):
+            _, indices_test = est_indices
+            posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
+
         if metric == "auc":
             # at this point, posterior_final is the predicted posterior for only the positive class
             # as more than one output is not supported.

From d46f1ad4615f2946a1a4bf1c7c058050ec350704 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 16 Oct 2023 16:13:53 -0400
Subject: [PATCH 04/26] Modify parallel building

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 6439d8dfe..6d00c4118 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -99,6 +99,7 @@ def _parallel_build_trees_with_sepdata(
     )
     return tree

+
 def _parallel_build_trees_and_compute_posteriors(
@@ -798,6 +799,11 @@ def _statistic(
             #     )
             # )

+            if self.permute_per_tree and covariate_index is not None:
+                random_states = [tree.random_state for tree in estimator.estimators_]
+            else:
+                random_states = [estimator.random_state] * len(estimator.estimators_)
+
             trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
                 delayed(_parallel_build_trees_with_sepdata)(
                     estimator.estimators_[idx],
@@ -818,7 +824,7 @@ def _statistic(
             # fitting a forest will only get one unique train/test split
             indices_train, indices_test = self.train_test_samples_[0]

-            X_train, X_test = X[indices_train, :], X[indices_test, :]
+            X_train, _ = X[indices_train, :], X[indices_test, :]
             y_train, y_test = y[indices_train, :], y[indices_test, :]

             if covariate_index is not None:
@@ -848,10 +854,12 @@ def _statistic(
         # accumulate the predictions across all trees
         all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
-                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
-                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
-                )
-        for itree, (proba, est_indices) in enumerate(zip(all_proba, self._get_estimators_indices())):
+            delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+            for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+        )
+        for itree, (proba, est_indices) in enumerate(
+            zip(all_proba, self._get_estimators_indices())
+        ):
             _, indices_test = est_indices
             posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
@@ -1044,6 +1052,10 @@ def _statistic(
             #         self._get_estimators_indices(sample_separate=True)
             #     )
            #      # )
+            if self.permute_per_tree and covariate_index is not None:
+                random_states = [tree.random_state for tree in estimator.estimators_]
+            else:
+                random_states = [estimator.random_state] * len(estimator.estimators_)

             trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
                 delayed(_parallel_build_trees_with_sepdata)(
@@ -1065,7 +1077,7 @@ def _statistic(
             # fitting a forest will only get one unique train/test split
             indices_train, indices_test = self.train_test_samples_[0]

-            X_train, X_test = X[indices_train, :], X[indices_test, :]
+            X_train, _ = X[indices_train, :], X[indices_test, :]
             y_train = y[indices_train, :]

             if covariate_index is not None:
@@ -1101,15 +1113,21 @@ def _statistic(
         # list of tree outputs. Each tree output is (n_samples, n_outputs), or (n_samples,)
         if predict_posteriors:
             all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
-                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict_proba, X, indices_test)
+                delayed(_parallel_predict_proba)(
+                    estimator.estimators_[idx].predict_proba, X, indices_test
+                )
                 for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
             )
         else:
             all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
-                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+                delayed(_parallel_predict_proba)(
+                    estimator.estimators_[idx].predict, X, indices_test
+                )
                 for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
             )
-        for itree, (proba, est_indices) in enumerate(zip(all_proba, self._get_estimators_indices())):
+        for itree, (proba, est_indices) in enumerate(
+            zip(all_proba, self._get_estimators_indices())
+        ):
             _, indices_test = est_indices
             posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)

From 17b01ac92b586b3b67c25c90d025c623b439461b Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 10:14:28 -0400
Subject: [PATCH 05/26] New submodule

Signed-off-by: Adam Li
---
 sktree/_lib/sklearn_fork | 2 +-
 sktree/stats/forestht.py | 9 +++++++--
 sktree/tree/_oblique_tree.pxd | 2 +-
 sktree/tree/_oblique_tree.pyx | 7 +++++--
 sktree/tree/tests/test_tree.py | 18 ++++++++++++++++++
 sktree/tree/tests/test_unsupervised_tree.py | 2 ++
 .../tree/unsupervised/_unsup_oblique_tree.pyx | 7 +++++--
 7 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork
index 6c7a5f44e..1adb20907 160000
--- a/sktree/_lib/sklearn_fork
+++ b/sktree/_lib/sklearn_fork
@@ -1 +1 @@
-Subproject commit 6c7a5f44eb4ec3bea5dd6a9e4d5db748d12b209e
+Subproject commit 1adb209077f12adac8f760196ae5260abab0cbdd
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 6d00c4118..5183d3d4f 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -20,6 +20,7 @@
     _parallel_build_trees,
 )
 from sktree.tree import DecisionTreeClassifier, DecisionTreeRegressor
+from sktree.tree._classes import DTYPE

 from .utils import (
     METRIC_FUNCTIONS,
@@ -315,7 +316,7 @@ def _statistic(
         raise NotImplementedError("Subclasses should implement this!")

     def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = None):
-        X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True)
+        X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True, dtype=DTYPE)
         if y.ndim != 2:
             y = y.reshape(-1, 1)
@@ -1129,7 +1130,11 @@ def _statistic(
             zip(all_proba, self._get_estimators_indices())
         ):
             _, indices_test = est_indices
-            posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
+
+            if predict_posteriors:
+                posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_classes_)
+            else:
+                posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)

         if metric == "auc":
             # at this point, posterior_final is the predicted posterior for only the positive class
diff --git a/sktree/tree/_oblique_tree.pxd b/sktree/tree/_oblique_tree.pxd
index db7813946..f85af4ce1 100644
--- a/sktree/tree/_oblique_tree.pxd
+++ b/sktree/tree/_oblique_tree.pxd
@@ -42,7 +42,7 @@ cdef class ObliqueTree(Tree):
     ) noexcept nogil
     cdef void _compute_feature_importances(
         self,
-        cnp.float64_t[:] importances,
+        float64_t[:] importances,
         Node* node
     ) noexcept nogil

diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx
index 99a3d6fc0..559231dae 100644
--- a/sktree/tree/_oblique_tree.pyx
+++ b/sktree/tree/_oblique_tree.pyx
@@ -217,11 +217,14 @@ cdef class ObliqueTree(Tree):
         self.proj_vec_weights.resize(capacity)
         self.proj_vec_indices.resize(capacity)

-        # value memory is initialised to 0 to enable classifier argmax
         if capacity > self.capacity:
+            # value memory is initialised to 0 to enable classifier argmax
             memset(<void*>(self.value + self.capacity * self.value_stride), 0,
                    (capacity - self.capacity) * self.value_stride * sizeof(float64_t))

+            # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct)
+            memset(<void*>(self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node))
+
         # if capacity smaller than node_count, adjust the counter
         if capacity < self.node_count:
             self.node_count = capacity
@@ -286,7 +289,7 @@ cdef class ObliqueTree(Tree):

     cdef void _compute_feature_importances(
         self,
-        cnp.float64_t[:] importances,
+        float64_t[:] importances,
         Node* node
     ) noexcept nogil:
         """Compute feature importances from a Node in the Tree.
diff --git a/sktree/tree/tests/test_tree.py b/sktree/tree/tests/test_tree.py
index 478aa6d5b..821793aa4 100644
--- a/sktree/tree/tests/test_tree.py
+++ b/sktree/tree/tests/test_tree.py
@@ -1,3 +1,5 @@
+import pickle
+
 import numpy as np
 import pytest
 from numpy.testing import assert_allclose
@@ -526,3 +528,19 @@ def test_balance_property(criterion, Tree):
     reg = Tree(criterion=criterion)
     reg.fit(X, y)
     assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y))
+
+
+# TODO: this does not work properly it seems
+@pytest.mark.skip(reason="Regression in non-deterministic pickle.")
+@pytest.mark.parametrize("Tree", ALL_TREES.values())
+def test_deterministic_pickle(Tree):
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/27268
+    # Uninitialised memory would lead to the two pickle strings being different.
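+    # the memsets added to the tree resize routines in this patch zero that
+    # freshly allocated node memory, so two identically-seeded fits should
+    # serialise byte-for-byte identically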
+    tree1 = Tree(random_state=0).fit(iris.data, iris.target)
+    tree2 = Tree(random_state=0).fit(iris.data, iris.target)
+
+    pickle1 = pickle.dumps(tree1)
+    pickle2 = pickle.dumps(tree2)
+
+    assert pickle1 == pickle2, f"Failed with {Tree}"
diff --git a/sktree/tree/tests/test_unsupervised_tree.py b/sktree/tree/tests/test_unsupervised_tree.py
index 2dba0fab8..c48e32ac4 100644
--- a/sktree/tree/tests/test_unsupervised_tree.py
+++ b/sktree/tree/tests/test_unsupervised_tree.py
@@ -1,3 +1,5 @@
+import pickle
+
 import numpy as np
 import pytest
 from sklearn import datasets
diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
index 51f6e6202..66e43d3fd 100644
--- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
+++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
@@ -197,11 +197,14 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree):
         self.proj_vec_weights.resize(capacity)
         self.proj_vec_indices.resize(capacity)

-        # value memory is initialised to 0 to enable classifier argmax
         if capacity > self.capacity:
+            # value memory is initialised to 0 to enable classifier argmax
             memset(<void*>(self.value + self.capacity * self.value_stride), 0,
                    (capacity - self.capacity) * self.value_stride * sizeof(float64_t))
-
+
+            # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct)
+            memset(<void*>(self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node))
+
         # if capacity smaller than node_count, adjust the counter
         if capacity < self.node_count:
             self.node_count = capacity

From 16122d3dff5d7351bd8f54f92a136961b5edb796 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 10:24:04 -0400
Subject: [PATCH 06/26] Add additional pickle test

Signed-off-by: Adam Li
---
 sktree/tree/tests/test_all_trees.py | 13 ++++++----
 sktree/tree/tests/test_tree.py | 24 ++++++++++++-------
 sktree/tree/tests/test_unsupervised_tree.py | 2 --
 .../tree/unsupervised/_unsup_oblique_tree.pyx | 4 ++--
 4 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/sktree/tree/tests/test_all_trees.py b/sktree/tree/tests/test_all_trees.py
index a5608fda5..66a9ea307 100644
--- a/sktree/tree/tests/test_all_trees.py
+++ b/sktree/tree/tests/test_all_trees.py
@@ -1,7 +1,7 @@
 import joblib
 import numpy as np
 import pytest
-from numpy.testing import assert_almost_equal, assert_array_almost_equal, assert_array_equal
+from numpy.testing import assert_almost_equal, assert_array_equal
 from sklearn.base import is_classifier
 from sklearn.datasets import make_blobs
 from sklearn.tree._tree import TREE_LEAF
@@ -53,9 +53,14 @@ def assert_tree_equal(d, s, message):

     assert_almost_equal(d.impurity, s.impurity, err_msg=message + ": inequal impurity")

-    assert_array_almost_equal(
-        d.value[external], s.value[external], err_msg=message + ": inequal value"
-    )
+    assert_array_equal(d.value[external], s.value[external], err_msg=message + ": inequal value")
+
+    if hasattr(d, "get_projection_matrix"):
+        assert_array_equal(
+            d.get_projection_matrix(),
+            s.get_projection_matrix(),
+            err_msg=message + ": inequal projection matrix",
+        )


 X_small = np.array(
diff --git a/sktree/tree/tests/test_tree.py b/sktree/tree/tests/test_tree.py
index 821793aa4..94cdfef62 100644
--- a/sktree/tree/tests/test_tree.py
+++ b/sktree/tree/tests/test_tree.py
@@ -1,4 +1,5 @@
 import pickle
+import sys

 import numpy as np
 import pytest
@@ -22,6 +23,8 @@
     UnsupervisedObliqueDecisionTree,
 )

+from .test_all_trees import assert_tree_equal
+
 CLUSTER_CRITERIONS = ("twomeans", "fastbic")
 REG_CRITERIONS = ("squared_error", "absolute_error", "friedman_mse", "poisson")
 CLF_CRITERIONS = ("gini", "entropy")
@@ -143,12 +146,6 @@
 digits.target = digits.target[perm]


-def assert_tree_equal(d, s, message):
-    assert s.node_count == d.node_count, "{0}: inequal number of node ({1} != {2})".format(
-        message, s.node_count, d.node_count
-    )
-
-
 def test_pickle_splitters():
     """Test that splitters are picklable."""
     import tempfile
@@ -530,8 +527,6 @@ def test_balance_property(criterion, Tree):
     assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y))


-# TODO: this does not work properly it seems
-@pytest.mark.skip(reason="Regression in non-deterministic pickle.")
 @pytest.mark.parametrize("Tree", ALL_TREES.values())
 def test_deterministic_pickle(Tree):
     # Non-regression test for:
@@ -543,4 +538,15 @@ def test_deterministic_pickle(Tree):
     pickle1 = pickle.dumps(tree1)
     pickle2 = pickle.dumps(tree2)

-    assert pickle1 == pickle2, f"Failed with {Tree}"
+    pickle1_tree = pickle.loads(pickle1)
+    pickle2_tree = pickle.loads(pickle2)
+    assert_tree_equal(
+        pickle1_tree.tree_,
+        pickle2_tree.tree_,
+        "The trees of the original and loaded classifiers are not equal.",
+    )
+    assert sys.getsizeof(pickle1_tree) == sys.getsizeof(pickle2_tree)
+
+    # TODO: this does not work properly it seems
+    if Tree not in (list(CLF_TREES.values()) + list(REG_TREES.values())):
+        assert pickle1 == pickle2, f"Failed with {Tree}"
diff --git a/sktree/tree/tests/test_unsupervised_tree.py b/sktree/tree/tests/test_unsupervised_tree.py
index c48e32ac4..2dba0fab8 100644
--- a/sktree/tree/tests/test_unsupervised_tree.py
+++ b/sktree/tree/tests/test_unsupervised_tree.py
@@ -1,5 +1,3 @@
-import pickle
-
 import numpy as np
 import pytest
 from sklearn import datasets
diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
index 66e43d3fd..e72802f66 100644
--- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
+++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
@@ -201,10 +201,10 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree):
             # value memory is initialised to 0 to enable classifier argmax
             memset(<void*>(self.value + self.capacity * self.value_stride), 0,
                    (capacity - self.capacity) * self.value_stride * sizeof(float64_t))
-
+
             # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct)
             memset(<void*>(self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node))
-
+
             # if capacity smaller than node_count, adjust the counter
             if capacity < self.node_count:
                 self.node_count = capacity

From 7d42ac7c3335fda615e01f019414ebddb5666cea Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 10:26:25 -0400
Subject: [PATCH 07/26] Add changelog

Signed-off-by: Adam Li
---
 doc/whats_new/v0.3.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/whats_new/v0.3.rst b/doc/whats_new/v0.3.rst
index fec97bb01..6eef2ef89 100644
--- a/doc/whats_new/v0.3.rst
+++ b/doc/whats_new/v0.3.rst
@@ -15,6 +15,7 @@ Changelog
 - |Fix| Fixes a bug in consistency of train/test samples when ``random_state`` is not set in FeatureImportanceForestClassifier and FeatureImportanceForestRegressor, by `Adam Li`_ (:pr:`135`)
 - |Fix| Fixes a bug where covariate indices were not shuffled by default when running FeatureImportanceForestClassifier and FeatureImportanceForestRegressor test methods, by `Sambit Panda`_ (:pr:`140`)
 - |Enhancement| Add multi-view splitter for axis-aligned decision trees, by `Adam Li`_ (:pr:`129`)
+- |API| ``FeatureImportanceForest*`` now has a hyperparameter to control the number of permutations done per forest, ``permute_per_forest_fraction``, by `Adam Li`_ (:pr:`145`)

 Code and Documentation Contributors
 -----------------------------------

From 4423377c89f631b36520b9fcfd2b36de2f336d16 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 10:52:28 -0400
Subject: [PATCH 08/26] Remove unnecessary comments

Signed-off-by: Adam Li
---
 sktree/tree/_oblique_tree.pyx | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx
index 559231dae..ecd257862 100644
--- a/sktree/tree/_oblique_tree.pyx
+++ b/sktree/tree/_oblique_tree.pyx
@@ -272,10 +272,7 @@ cdef class ObliqueTree(Tree):
         # get the index of the node
         cdef intp_t node_id = node - self.nodes

-        # cdef intp_t n_projections = proj_vec_indices.size()
         # compute projection of the data based on trained tree
-        # proj_vec_weights = self.proj_vec_weights[node_id]
-        # proj_vec_indices = self.proj_vec_indices[node_id]
         for j in range(0, self.proj_vec_indices[node_id].size()):
             feature_index = self.proj_vec_indices[node_id][j]
             weight = self.proj_vec_weights[node_id][j]

From 5730b326ff1f61d5421f0b4b8159cff80efd8895 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 14:36:12 -0400
Subject: [PATCH 09/26] Remove extra LOC

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 76 ----------------------------------------
 1 file changed, 76 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 5183d3d4f..2492a9e92 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -101,82 +101,6 @@ def _parallel_build_trees_with_sepdata(
     return tree


-def _parallel_build_trees_and_compute_posteriors(
-    forest: BaseForest,
-    idx: int,
-    indices_train: ArrayLike,
-    indices_test: ArrayLike,
-    X: ArrayLike,
-    y: ArrayLike,
-    covariate_index,
-    posterior_arr: ArrayLike,
-    predict_posteriors: bool,
-    permute_per_tree: bool,
-    type_of_target,
-    sample_weight: ArrayLike = None,
-    class_weight=None,
-    missing_values_in_feature_mask=None,
-    classes=None,
-):
-    """Parallel function to build trees and compute posteriors.
-
-    This inherently assumes that the caller function defines the indices
-    for the training and testing data for each tree.
-    """
-    tree: Union[DecisionTreeClassifier, DecisionTreeRegressor] = forest.estimators_[idx]
-    if permute_per_tree and covariate_index is not None:
-        random_state = tree.random_state
-    else:
-        random_state = forest.random_state
-
-    X_train = X[indices_train, :]
-    y_train = y[indices_train, ...]
-    rng = np.random.default_rng(random_state)
-
-    if forest.bootstrap:
-        n_samples_bootstrap = _get_n_samples_bootstrap(
-            n_samples=X_train.shape[0], max_samples=forest.max_samples
-        )
-    else:
-        n_samples_bootstrap = None
-
-    # individual tree permutation of y labels
-    if covariate_index is not None:
-        indices = np.arange(X_train.shape[0], dtype=int)
-        # perform permutation of covariates
-        index_arr = rng.choice(indices, size=(X_train.shape[0], 1), replace=False, shuffle=True)
-        perm_X_cov = X_train[index_arr, covariate_index]
-        X_train[:, covariate_index] = perm_X_cov
-
-    if type_of_target == "binary":
-        y_train = y_train.ravel()
-
-    tree = _parallel_build_trees(
-        tree,
-        forest.bootstrap,
-        X_train,
-        y_train,
-        sample_weight,
-        idx,
-        len(forest.estimators_),
-        verbose=0,
-        class_weight=class_weight,
-        n_samples_bootstrap=n_samples_bootstrap,
-        missing_values_in_feature_mask=missing_values_in_feature_mask,
-        classes=classes,
-    )
-
-    if predict_posteriors:
-        # XXX: currently assumes n_outputs_ == 1
-        y_pred = tree.predict_proba(X[indices_test, :]).reshape(-1, tree.n_classes_)
-    else:
-        y_pred = tree.predict(X[indices_test, :]).reshape(-1, tree.n_outputs_)
-
-    # Fill test set posteriors & set rest NaN
-    posterior_arr[idx, indices_test, :] = y_pred  # posterior
-
-
-# TODO: add support for the permute_per_forest_fraction option
 class BaseForestHT(MetaEstimatorMixin):
     observe_samples_: ArrayLike
     observe_posteriors_: ArrayLike

From 6f978cba406548d0485e6687da2d32e0973448ac Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 16:43:48 -0400
Subject: [PATCH 10/26] Fix pvalue

Signed-off-by: Adam Li
---
 .../plot_MI_gigantic_hypothesis_testing_forest.py | 6 +++---
 sktree/stats/forestht.py | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py
index 423bc63dc..71db71b5c 100644
--- a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py
+++ b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py
@@ -108,7 +108,7 @@
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=True,
+    permute_per_tree=False,
     sample_dataset_per_tree=False,
 )
@@ -120,7 +120,7 @@
 stat, pvalue = est.test(
     X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats
 )
-print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}")
+print(f"Estimated MI difference for the important feature set: {stat} with Pvalue: {pvalue}")

 # we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05
 stat, pvalue = est.test(
@@ -130,7 +130,7 @@
     metric="mi",
     n_repeats=n_repeats,
 )
-print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}")
+print(f"Estimated MI difference for the unimportant feature set: {stat} with Pvalue: {pvalue}")

 # %%
 # References
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 2492a9e92..f7ec46ad5 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -115,8 +115,8 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=True,
-        sample_dataset_per_tree=True,
+        permute_per_tree=False,
+        sample_dataset_per_tree=False,
     ):
         self.estimator = estimator
         self.random_state = random_state
@@ -617,8 +617,8 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=True,
-        sample_dataset_per_tree=True,
+        permute_per_tree=False,
+        sample_dataset_per_tree=False,
     ):
         super().__init__(
             estimator=estimator,
@@ -903,8 +903,8 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=True,
-        sample_dataset_per_tree=True,
+        permute_per_tree=False,
+        sample_dataset_per_tree=False,
     ):
         super().__init__(
             estimator=estimator,

From 58d5365c896396525aaca95936e54fadd166303c Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 16:45:06 -0400
Subject: [PATCH 11/26] Lint

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index f7ec46ad5..9f0edcb32 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -11,7 +11,6 @@
 from sklearn.utils.validation import _is_fitted, check_X_y

 from sktree._lib.sklearn.ensemble._forest import (
-    BaseForest,
     ForestClassifier,
     ForestRegressor,
     RandomForestClassifier,

From f5f282aca67eec88ccb559b83dcae3455319e544 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 21:21:35 -0400
Subject: [PATCH 12/26] Start work on permute fraction of forest

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 9f0edcb32..c08259593 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -116,6 +116,7 @@ def __init__(
         test_size=0.2,
         permute_per_tree=False,
         sample_dataset_per_tree=False,
+        permute_forest_fraction=None,
     ):
         self.estimator = estimator
         self.random_state = random_state
@@ -123,7 +124,8 @@ def __init__(
         self.test_size = test_size
         self.permute_per_tree = permute_per_tree
         self.sample_dataset_per_tree = sample_dataset_per_tree
-
+        self.permute_forest_fraction = permute_forest_fraction
+
         self.n_samples_test_ = None
         self._n_samples_ = None
         self._metric = None
@@ -618,6 +620,7 @@ def __init__(
         test_size=0.2,
         permute_per_tree=False,
         sample_dataset_per_tree=False,
+        permute_forest_fraction=None,
     ):
         super().__init__(
             estimator=estimator,
@@ -626,6 +629,7 @@ def __init__(
             test_size=test_size,
             permute_per_tree=permute_per_tree,
             sample_dataset_per_tree=sample_dataset_per_tree,
+            permute_forest_fraction=permute_forest_fraction
         )

     def _get_estimator(self):
@@ -904,6 +908,7 @@ def __init__(
         test_size=0.2,
         permute_per_tree=False,
         sample_dataset_per_tree=False,
+        permute_forest_fraction=None,
     ):
         super().__init__(
             estimator=estimator,
@@ -912,6 +917,7 @@ def __init__(
             test_size=test_size,
             permute_per_tree=permute_per_tree,
             sample_dataset_per_tree=sample_dataset_per_tree,
+            permute_forest_fraction=permute_forest_fraction
         )

     def _get_estimator(self):

From 3674cc20809d30566eefb915ed7d39b67684cf13 Mon Sep 17 00:00:00 2001
From: Haoyin Xu
Date: Mon, 23 Oct 2023 13:32:07 -0400
Subject: [PATCH 13/26] FIX add stratifi

---
 sktree/stats/forestht.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index ab29afdd9..5ee738327 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -1064,17 +1064,17 @@ def _statistic(
                 delayed(_parallel_predict_proba)(
                     estimator.estimators_[idx].predict_proba, X, indices_test
                 )
-                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+                for idx, (_, indices_test) in enumerate(self._get_estimators_indices(y))
             )
         else:
             all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
                 delayed(_parallel_predict_proba)(
                     estimator.estimators_[idx].predict, X, indices_test
                 )
-                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+                for idx, (_, indices_test) in enumerate(self._get_estimators_indices(y))
             )
         for itree, (proba, est_indices) in enumerate(
-            zip(all_proba, self._get_estimators_indices())
+            zip(all_proba, self._get_estimators_indices(y))
         ):
             _, indices_test = est_indices

From 0f27d018fbc3786a4f23adf5c0931ef4b01faa47 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 24 Oct 2023 10:37:56 -0400
Subject: [PATCH 14/26] Try stash

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 25 +++++++++++++------------
 sktree/stats/tests/test_forestht.py | 1 -
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 5ee738327..08d3908d5 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -177,13 +177,13 @@ def _get_estimators_indices(self, stratifier=None, sample_separate=False):
                     self._seeds.append(tree.random_state)
             seeds = self._seeds

-        if sample_separate:
-            if self._perm_seeds is None:
-                new_rng = np.random.default_rng(np.random.randint(0, 1e6))
-                self._perm_seeds = new_rng.integers(
-                    low=0, high=np.iinfo(np.int32).max, size=len(self.estimator_.estimators_)
-                )
-            seeds = self._perm_seeds
+        # if sample_separate:
+        #     if self._perm_seeds is None:
+        #         new_rng = np.random.default_rng(np.random.randint(0, 1e6))
+        #         self._perm_seeds = new_rng.integers(
+        #             low=0, high=np.iinfo(np.int32).max, size=len(self.estimator_.estimators_)
+        #         )
+        #     seeds = self._perm_seeds

         for idx, tree in enumerate(self.estimator_.estimators_):
             seed = seeds[idx]
@@ -207,6 +207,7 @@ def _get_estimators_indices(self, stratifier=None, sample_separate=False):
             indices_train, indices_test = train_test_split(
                 indices,
+                shuffle=True,
                 test_size=self.test_size,
                 stratify=stratifier,
                 random_state=self._seeds,
             )
@@ -796,10 +797,10 @@ def _statistic(
         # accumulate the predictions across all trees
         all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
             delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
-            for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+            for idx, (_, indices_test) in enumerate(self.train_test_samples_)
         )
         for itree, (proba, est_indices) in enumerate(
-            zip(all_proba, self._get_estimators_indices())
+            zip(all_proba, self.train_test_samples_)
         ):
             _, indices_test = est_indices
             posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
@@ -1064,17 +1065,17 @@ def _statistic(
                 delayed(_parallel_predict_proba)(
                     estimator.estimators_[idx].predict_proba, X, indices_test
                 )
-                for idx, (_, indices_test) in enumerate(self._get_estimators_indices(y))
+                for idx, (_, indices_test) in enumerate(self.train_test_samples_)
             )
         else:
             all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
                 delayed(_parallel_predict_proba)(
                     estimator.estimators_[idx].predict, X, indices_test
                 )
-                for idx, (_, indices_test) in enumerate(self._get_estimators_indices(y))
+                for idx, (_, indices_test) in enumerate(self.train_test_samples_)
             )
         for itree, (proba, est_indices) in enumerate(
-            zip(all_proba, self._get_estimators_indices(y))
+            zip(all_proba, self.train_test_samples_)
         ):
             _, indices_test = est_indices
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index e71e5e09b..d5eade80d 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -475,7 +475,6 @@ def test_small_dataset_dependent(seed):
     y = np.vstack(
         [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))]
     )  # Binary classification
-    print(X.shape, y.shape)

     clf = FeatureImportanceForestClassifier(
         estimator=HonestForestClassifier(

From e8947088dac06fcbb3ee9fe13b3b7b0d0e01e576 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 24 Oct 2023 11:33:52 -0400
Subject: [PATCH 15/26] Update and address permute forest fraction

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 62 +++++++++++--------
 sktree/stats/tests/test_forestht.py | 93 ++++++++++++++++-------------
 sktree/tree/_honest_tree.py | 1 +
 3 files changed, 89 insertions(+), 67 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 08d3908d5..847557ac0 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -114,19 +114,19 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=False,
-        sample_dataset_per_tree=False,
         stratify=False,
+        sample_dataset_per_tree=False,
         permute_forest_fraction=None,
     ):
         self.estimator = estimator
         self.random_state = random_state
         self.verbose = verbose
         self.test_size = test_size
-        self.permute_per_tree = permute_per_tree
+        self.stratify = stratify
+
+        # XXX: possibly removing these parameters
         self.sample_dataset_per_tree = sample_dataset_per_tree
         self.permute_forest_fraction = permute_forest_fraction
-        self.stratify = stratify

         self.n_samples_test_ = None
         self._n_samples_ = None
@@ -166,15 +166,33 @@ def _get_estimators_indices(self, stratifier=None, sample_separate=False):
         # Get drawn indices along both sample and feature axes
         rng = np.random.default_rng(self.estimator_.random_state)

-        if self.sample_dataset_per_tree:
+        if self.permute_forest_fraction is None:
+            permute_forest_fraction = 0.0
+        else:
+            permute_forest_fraction = self.permute_forest_fraction
+
+        # TODO: consolidate how we "sample/permute" per subset of the forest
+        if self.sample_dataset_per_tree or permute_forest_fraction > 0.0:
+            # sample random seeds
             if self._seeds is None:
                 self._seeds = []
+                self._n_permutations = 0

-                for tree in self.estimator_.estimators_:
-                    if tree.random_state is None:
-                        self._seeds.append(rng.integers(low=0, high=np.iinfo(np.int32).max))
-                    else:
-                        self._seeds.append(tree.random_state)
+                num_trees_per_seed = max(
+                    int(permute_forest_fraction * len(self.estimator_.estimators_)), 1
+                )
+                for tree_idx, tree in enumerate(self.estimator_.estimators_):
+                    if tree_idx == 0 or tree_idx % num_trees_per_seed == 0:
+                        if tree.random_state is None:
+                            seed = rng.integers(low=0, high=np.iinfo(np.int32).max)
+                        else:
+                            seed = tree.random_state
+
+                        self._n_permutations += 1
+                    self._seeds.append(seed)
+
+            # now that we have the random seeds, we can sample the train/test indices
+            # deterministically
             seeds = self._seeds

             # if sample_separate:
@@ -650,7 +650,7 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=False,
+        # permute_per_tree=False,
         sample_dataset_per_tree=False,
         permute_forest_fraction=None,
     ):
@@ -659,7 +659,7 @@ def __init__(
             random_state=random_state,
             verbose=verbose,
             test_size=test_size,
-            permute_per_tree=permute_per_tree,
+            # permute_per_tree=permute_per_tree,
             sample_dataset_per_tree=sample_dataset_per_tree,
             permute_forest_fraction=permute_forest_fraction,
         )
@@ -721,7 +739,7 @@ def _statistic(
         # both sampling dataset per tree or permuting per tree requires us to bypass the
         # sklearn API to fit each tree individually
-        if self.sample_dataset_per_tree or self.permute_per_tree:
+        if self.sample_dataset_per_tree or self.permute_forest_fraction:
             # Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
             #     delayed(_parallel_build_trees_and_compute_posteriors)(
             #         estimator,
@@ -759,7 +777,7 @@ def _statistic(
             #     )
             # )

-            if self.permute_per_tree and covariate_index is not None:
+            if self.permute_forest_fraction and covariate_index is not None:
                 random_states = [tree.random_state for tree in estimator.estimators_]
             else:
                 random_states = [estimator.random_state] * len(estimator.estimators_)
@@ -799,9 +817,7 @@ def _statistic(
             delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
             for idx, (_, indices_test) in enumerate(self.train_test_samples_)
         )
-        for itree, (proba, est_indices) in enumerate(
-            zip(all_proba, self.train_test_samples_)
-        ):
+        for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)):
             _, indices_test = est_indices
             posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
@@ -923,9 +939,10 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=False,
+        # permute_per_tree=False,
         sample_dataset_per_tree=False,
         stratify=True,
         permute_forest_fraction=None,
     ):
@@ -933,7 +949,7 @@ def __init__(
             random_state=random_state,
             verbose=verbose,
             test_size=test_size,
-            permute_per_tree=permute_per_tree,
+            # permute_per_tree=permute_per_tree,
             sample_dataset_per_tree=sample_dataset_per_tree,
             stratify=stratify,
             permute_forest_fraction=permute_forest_fraction,
         )
@@ -982,7 +998,7 @@ def _statistic(
         # both sampling dataset per tree or permuting per tree requires us to bypass the
         # sklearn API to fit each tree individually
-        if self.sample_dataset_per_tree or self.permute_per_tree:
+        if self.sample_dataset_per_tree or self.permute_forest_fraction:
@@ -1001,7 +1017,7 @@ def _statistic(
-            if self.permute_per_tree and covariate_index is not None:
+            if self.permute_forest_fraction and covariate_index is not None:
                 random_states = [tree.random_state for tree in estimator.estimators_]
             else:
                 random_states = [estimator.random_state] * len(estimator.estimators_)
@@ -1074,9 +1090,7 @@ def _statistic(
-        for itree, (proba, est_indices) in enumerate(
-            zip(all_proba, self.train_test_samples_)
-        ):
+        for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)):
             _, indices_test = est_indices
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index d5eade80d..6c6668563 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -33,17 +33,18 @@
 @pytest.mark.parametrize("sample_dataset_per_tree", [True, False])
 def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree):
+    n_samples = 50
     est = FeatureImportanceForestClassifier(
         estimator=RandomForestClassifier(
             n_estimators=10,
             random_state=seed,
         ),
-        permute_per_tree=True,
+        # permute_per_tree=True,
+        permute_forest_fraction=1.0 / n_samples,
         test_size=0.7,
         random_state=seed,
         sample_dataset_per_tree=sample_dataset_per_tree,
     )
-    n_samples = 50
     est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse")

     assert (
@@ -71,17 +72,18 @@
 @pytest.mark.parametrize("sample_dataset_per_tree", [True, False])
 def test_featureimportance_forest_stratified(sample_dataset_per_tree):
+    n_samples = 100
     est = FeatureImportanceForestClassifier(
         estimator=RandomForestClassifier(
             n_estimators=10,
             random_state=seed,
         ),
-        permute_per_tree=True,
+        # permute_per_tree=True,
+        permute_forest_fraction=1.0 / n_samples,
         test_size=0.7,
         random_state=seed,
         sample_dataset_per_tree=sample_dataset_per_tree,
     )
-    n_samples = 100
     est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi")

     _, indices_test = est.train_test_samples_[0]
@@ -102,14 +104,14 @@

 def test_featureimportance_forest_errors():
-    permute_per_tree = False
+    # permute_per_tree = False
     sample_dataset_per_tree = True
     est = FeatureImportanceForestClassifier(
         estimator=RandomForestClassifier(
             n_estimators=10,
         ),
         test_size=0.5,
-        permute_per_tree=permute_per_tree,
+        permute_forest_fraction=None,
         sample_dataset_per_tree=sample_dataset_per_tree,
     )
     with pytest.raises(RuntimeError, match="The estimator must be fitted"):
         est.train_test_samples_
@@ -139,15 +141,15 @@
     ],
 )
 @pytest.mark.parametrize(
-    "permute_per_tree",
+    "permute_forest_fraction",
     [
-        True,
-        False,
+        None,
+        0.5,
     ],
 )
 @pytest.mark.parametrize("sample_dataset_per_tree", [True, False])
 def test_iris_pauc_statistic(
-    criterion, honest_prior, estimator, permute_per_tree, sample_dataset_per_tree
+    criterion, honest_prior, estimator, permute_forest_fraction, sample_dataset_per_tree
 ):
     limit = 0.1
     max_features = "sqrt"
@@ -168,14 +170,14 @@
         ),
         test_size=test_size,
         sample_dataset_per_tree=sample_dataset_per_tree,
-        permute_per_tree=permute_per_tree,
+        permute_forest_fraction=permute_forest_fraction,
     )
     # now add completely uninformative feature
     X = np.hstack((iris_X, rng.standard_normal(size=(iris_X.shape[0], 4))))

     # test for unimportant feature set
     clf.reset()
-    if sample_dataset_per_tree and not permute_per_tree:
+    if sample_dataset_per_tree and permute_forest_fraction is None:
         # test in another test
         return
@@ -219,7 +221,8 @@
                 n_estimators=10,
             ),
             random_state=seed,
-            permute_per_tree=False,
+            # permute_per_tree=False,
+            permute_forest_fraction=None,
             sample_dataset_per_tree=False,
         ),
     ],
@@ -323,19 +326,33 @@ def test_pickle(tmpdir):
         assert_array_equal(getattr(clf, attr), getattr(clf_pickle, attr))


-@pytest.mark.parametrize("permute_per_tree", [True, False], ids=["permute_per_tree", "no_permute"])
+@pytest.mark.parametrize(
+    "permute_forest_fraction",
+    [
+        None,
+        # 0.5
+    ],
+    ids=[
+        "no_permute"
+        # "permute_forest_fraction",
+    ],
+)
 @pytest.mark.parametrize(
     "sample_dataset_per_tree", [True, False], ids=["sample_dataset_per_tree", "no_sample_dataset"]
 )
-def test_sample_size_consistency_of_estimator_indices_(permute_per_tree, sample_dataset_per_tree):
+def test_sample_size_consistency_of_estimator_indices_(
+    permute_forest_fraction, sample_dataset_per_tree
+):
     """Test that the test-sample indices are what is expected."""
     clf = FeatureImportanceForestClassifier(
         estimator=HonestForestClassifier(
             n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2
         ),
         test_size=0.5,
-        permute_per_tree=permute_per_tree,
+        # permute_per_tree=permute_per_tree,
+        permute_forest_fraction=permute_forest_fraction,
         sample_dataset_per_tree=sample_dataset_per_tree,
+        stratify=False,
     )

     n_samples = 100
@@ -346,6 +363,7 @@
     _, posteriors, samples = clf.statistic(
         X, y, covariate_index=None, return_posteriors=True, metric="mi"
     )
+    print(clf._seeds)
     if sample_dataset_per_tree:
         # check the non-nans
         non_nan_idx = _non_nan_samples(posteriors)
@@ -367,7 +385,11 @@
             f"{set(sorted_est_samples_idx) - set(sorted_sample_idx)}",
         )
     else:
-        assert_array_equal(samples, sorted(clf.train_test_samples_[0][1]))
+        assert_array_equal(
+            samples,
+            sorted(clf.train_test_samples_[0][1]),
+            err_msg=f"Samples {set(samples) - set(sorted(clf.train_test_samples_[0][1]))}.",
+        )
         assert len(_non_nan_samples(posteriors)) == len(samples)
@@ -384,7 +406,8 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_dataset_per_tree):
             n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2
         ),
         test_size=0.5,
-        permute_per_tree=True,
+        # permute_per_tree=True,
+        permute_forest_fraction=1.0 / n_samples,
         sample_dataset_per_tree=sample_dataset_per_tree,
     )
     other_clf = FeatureImportanceForestClassifier(
         estimator=HonestForestClassifier(
             n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2
         ),
         test_size=0.5,
-        permute_per_tree=False,
+        # permute_per_tree=False,
+        permute_forest_fraction=None,
         sample_dataset_per_tree=sample_dataset_per_tree,
     )
@@ -405,8 +429,8 @@
         assert_array_equal(clf.train_test_samples_[idx][0], estimator_train_test_indices[idx][0])
         assert_array_equal(clf.train_test_samples_[idx][1], estimator_train_test_indices[idx][1])

-    # if the sample_dataset_per_tree, then the indices should be different across all
-    if sample_dataset_per_tree:
+    # if sample_dataset_per_tree is set, then the indices should be different across all trees
+    if sample_dataset_per_tree or clf.permute_forest_fraction > 0.0:
         for indices, other_indices in combinations(clf.train_test_samples_, 2):
             assert not np.array_equal(indices[0], other_indices[0])
             assert not np.array_equal(indices[1], other_indices[1])
@@ -426,7 +450,7 @@
     )

     # when seed is passed, the indices should be deterministic
-    if seed is not None:
+    if seed is not None and sample_dataset_per_tree:
         assert_array_equal(
             clf.train_test_samples_[idx][0], other_clf.train_test_samples_[idx][0]
         )
@@ -448,7 +472,8 @@ def test_small_dataset_independent(seed):
             n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.5
         ),
         test_size=0.2,
-        permute_per_tree=False,
+        # permute_per_tree=False,
+        permute_forest_fraction=None,
         sample_dataset_per_tree=False,
     )
     stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi")
@@ -481,7 +506,8 @@
             n_estimators=50, random_state=seed, n_jobs=1, honest_fraction=0.5
         ),
         test_size=0.2,
-        permute_per_tree=False,
+        # permute_per_tree=False,
+        permute_forest_fraction=None,
         sample_dataset_per_tree=False,
     )
     stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi")
@@ -491,22 +517,3 @@

     stat, pvalue = clf.test(X, y, metric="mi")
     assert pvalue <= 0.05
-
-
-# @pytest.mark.monitor_test
-# def test_memory_usage():
-#     n_samples = 1000
-#     n_features = 5000
-#     X = rng.uniform(size=(n_samples, n_features))
-#     y = rng.integers(0, 2, size=n_samples)  # Binary classification
-
-#     clf = FeatureImportanceForestClassifier(
-#         estimator=HonestForestClassifier(
-#             n_estimators=10, random_state=seed, n_jobs=-1, honest_fraction=0.5
-#         ),
-#         test_size=0.2,
-#         permute_per_tree=False,
-#         sample_dataset_per_tree=False,
-#     )
-
-#     stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi")
diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py
index 7d534ddb9..b8bd2528b 100644
--- a/sktree/tree/_honest_tree.py
+++ b/sktree/tree/_honest_tree.py
@@ -528,6 +528,7 @@ def _fit(

         nonzero_indices = np.where(_sample_weight > 0)[0]

+        # TODO: perhaps we want to stratify this split
         self.structure_indices_ = rng.choice(
             nonzero_indices,
             int((1 - self.honest_fraction) * len(nonzero_indices)),

From 2887909692f3ef7a55bbd6f6b1ba113f110ed9d4 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 24 Oct 2023 13:16:17 -0400
Subject: [PATCH 16/26] WIP

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 108 +++++++++++++---------------
 sktree/stats/tests/test_forestht.py | 17 ++---
 2 files changed, 104 insertions(+), 29 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 847557ac0..c1f9026fd 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -10,6 +10,7 @@
 from sklearn.utils.multiclass import type_of_target
 from sklearn.utils.validation import _is_fitted, check_X_y

+from sktree import HonestForestClassifier
 from sktree._lib.sklearn.ensemble._forest import (
     ForestClassifier,
     ForestRegressor,
@@ -117,6 +118,7 @@ def __init__(
         stratify=False,
         sample_dataset_per_tree=False,
         permute_forest_fraction=None,
+        train_test_split=True,
     ):
         self.estimator = estimator
         self.random_state = random_state
@@ -124,6 +126,7 @@ def __init__(
         self.test_size = test_size

         self.stratify = stratify
+        self.train_test_split = train_test_split
         # XXX: possibly removing these parameters
         self.sample_dataset_per_tree = sample_dataset_per_tree
         self.permute_forest_fraction = permute_forest_fraction
@@ -250,6 +253,10 @@ def train_test_samples_(self):
         if self._n_samples_ is None:
             raise RuntimeError("The estimator must be fitted before accessing this attribute.")

+        # we are not train/test splitting, then
+        if not self.train_test_split:
+            return [(np.arange(self._n_samples_, dtype=int), np.array([], dtype=int)) for _ in range(len(self.estimator_.estimators_))]
+
         # Stratifier uses a cached _y attribute if available
         stratifier = self._y if is_classifier(self.estimator_) and self.stratify else None
@@ -303,6 +310,9 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = None):
                 f"y must have type {self._type_of_target_}, got {type_of_target(y)}. "
                 f"If running on a new dataset, call the 'reset' method."
             )
+
+        if not self.train_test_split and not isinstance(self.estimator, HonestForestClassifier):
+            raise RuntimeError(f'Train test split must occur if not using honest forest classifier.')

         return X, y, covariate_index
@@ -592,12 +602,16 @@ class FeatureImportanceForestRegressor(BaseForestHT):
     test_size : float, default=0.2
         Proportion of samples per tree to use for the test set.

-    permute_per_tree : bool, default=True
-        Whether to permute the covariate index per tree or per forest.
-
     sample_dataset_per_tree : bool, default=False
         Whether to sample the dataset per tree or per forest.

+    permute_forest_fraction : float, default=None
+        The fraction of trees to permute the covariate index for. If None, then
+        just one permutation is performed.
+
+    train_test_split : bool, default=True
+        Whether to split the dataset before passing to the forest.
+
     Attributes
     ----------
     estimator_ : BaseForest
@@ -653,6 +667,7 @@ def __init__(
         # permute_per_tree=False,
         sample_dataset_per_tree=False,
         permute_forest_fraction=None,
+        train_test_split=True,
     ):
         super().__init__(
             estimator=estimator,
@@ -662,6 +677,7 @@ def __init__(
             # permute_per_tree=permute_per_tree,
             sample_dataset_per_tree=sample_dataset_per_tree,
             permute_forest_fraction=permute_forest_fraction,
+            train_test_split=train_test_split,
         )

     def _get_estimator(self):
@@ -740,25 +756,6 @@ def _statistic(
         # both sampling dataset per tree or permuting per tree requires us to bypass the
         # sklearn API to fit each tree individually
         if self.sample_dataset_per_tree or self.permute_forest_fraction:
-            # Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
-            #     delayed(_parallel_build_trees_and_compute_posteriors)(
-            #         estimator,
-            #         idx,
-            #         indices_train,
-            #         indices_test,
-            #         X,
-            #         y,
-            #         covariate_index,
-            #         posterior_arr,
-            #         False,
-            #         self.permute_per_tree,
-            #         self._type_of_target_,
-            #     )
-            #     for idx, (indices_train, indices_test) in enumerate(
-            #         self._get_estimators_indices(sample_separate=True)
-            #     )
-            # )
-
             if self.permute_forest_fraction and covariate_index is not None:
                 random_states = [tree.random_state for tree in estimator.estimators_]
             else:
                 random_states = [estimator.random_state] * len(estimator.estimators_)
@@ -802,24 +799,29 @@ def _statistic(
             estimator.fit(X_train, y_train)

-        # construct posterior array for all trees (n_trees, n_samples_test, n_outputs)
-        # for itree, tree in enumerate(estimator.estimators_):
-        #     posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape(
-        #         -1, tree.n_outputs_
-        #     )
-
         # set variables to compute metric
         samples = indices_test
         y_true_final = y_test

-        # accumulate the predictions across all trees
-        all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
-            delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
-            for idx, (_, indices_test) in enumerate(self.train_test_samples_)
-        )
-        for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)):
-            _, indices_test = est_indices
-            posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
+        # TODO: probably a more elegant way of doing this
+        if self.train_test_split:
+            # accumulate the predictions across all trees
+            all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
+                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+                for idx, (_, indices_test) in enumerate(self.train_test_samples_)
+            )
+            for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)):
+                _, indices_test = est_indices
+                posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
+        else:
+            # accumulate the predictions across all trees
+            all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
+                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+                for idx, honest_tree in enumerate(estimator.estimators_)
+            )
+            for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)):
+                _, indices_test = est_indices
+                posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)

         # determine if there are any nans in the final posterior array, when
         # averaged over the trees
@@ -880,14 +882,18 @@ class FeatureImportanceForestClassifier(BaseForestHT):
     test_size : float, default=0.2
         Proportion of samples per tree to use for the test set.
- permute_per_tree : bool, default=True - Whether to permute the covariate index per tree or per forest. + stratify : bool, default=True + Whether to stratify the samples by class labels. sample_dataset_per_tree : bool, default=False Whether to sample the dataset per tree or per forest. - stratify : bool, default=True - Whether to stratify the samples by class labels. + permute_forest_fraction : float, default=None + The fraction of trees to permute the covariate index for. If None, then + just one permutation is performed. + + train_test_split : bool, default=True + Whether to split the data into train/test before passing to the forest. Attributes ---------- @@ -940,9 +946,10 @@ def __init__( verbose=0, test_size=0.2, # permute_per_tree=False, - sample_dataset_per_tree=False, stratify=True, + sample_dataset_per_tree=False, permute_forest_fraction=None, + train_test_split=True, ): super().__init__( estimator=estimator, @@ -952,6 +959,7 @@ def __init__( # permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, stratify=stratify, + train_test_split=train_test_split, permute_forest_fraction=permute_forest_fraction, ) @@ -999,24 +1007,6 @@ def _statistic( # both sampling dataset per tree or permuting per tree requires us to bypass the # sklearn API to fit each tree individually if self.sample_dataset_per_tree or self.permute_forest_fraction: - # Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")( - # delayed(_parallel_build_trees_and_compute_posteriors)( - # estimator, - # idx, - # indices_train, - # indices_test, - # X, - # y, - # covariate_index, - # posterior_arr, - # predict_posteriors, - # self.permute_per_tree, - # self._type_of_target_, - # ) - # for idx, (indices_train, indices_test) in enumerate( - # self._get_estimators_indices(sample_separate=True) - # ) - # ) if self.permute_forest_fraction and covariate_index is not None: random_states = [tree.random_state for tree in estimator.estimators_] else: diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 6c6668563..4343a27af 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -328,14 +328,8 @@ def test_pickle(tmpdir): @pytest.mark.parametrize( "permute_forest_fraction", - [ - None, - # 0.5 - ], - ids=[ - "no_permute" - # "permute_forest_fraction", - ], + [None, 0.5], + ids=["no_permute", "permute_forest_fraction"], ) @pytest.mark.parametrize( "sample_dataset_per_tree", [True, False], ids=["sample_dataset_per_tree", "no_sample_dataset"] @@ -363,11 +357,12 @@ def test_sample_size_consistency_of_estimator_indices_( _, posteriors, samples = clf.statistic( X, y, covariate_index=None, return_posteriors=True, metric="mi" ) - print(clf._seeds) - if sample_dataset_per_tree: + + if sample_dataset_per_tree or permute_forest_fraction is not None: # check the non-nans non_nan_idx = _non_nan_samples(posteriors) - assert clf.n_samples_test_ == n_samples, f"{clf.n_samples_test_} != {n_samples}" + if sample_dataset_per_tree: + assert clf.n_samples_test_ == n_samples, f"{clf.n_samples_test_} != {n_samples}" sorted_sample_idx = sorted(np.unique(samples)) sorted_est_samples_idx = sorted( From 739c7bef58586b73f43fae52bff30d611a375f29 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 24 Oct 2023 17:35:38 -0400 Subject: [PATCH 17/26] Adding ability to turn off train/test split Signed-off-by: Adam Li --- sktree/stats/forestht.py | 79 ++++++++++++++++++----------- sktree/stats/tests/test_forestht.py | 54 ++++++++++++++++++++ 2 
files changed, 104 insertions(+), 29 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index c1f9026fd..7f35ff9f4 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -10,7 +10,6 @@ from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _is_fitted, check_X_y -from sktree import HonestForestClassifier from sktree._lib.sklearn.ensemble._forest import ( ForestClassifier, ForestRegressor, @@ -19,6 +18,7 @@ _get_n_samples_bootstrap, _parallel_build_trees, ) +from sktree.ensemble._honest_forest import HonestForestClassifier from sktree.tree import DecisionTreeClassifier, DecisionTreeRegressor from sktree.tree._classes import DTYPE @@ -253,9 +253,12 @@ def train_test_samples_(self): if self._n_samples_ is None: raise RuntimeError("The estimator must be fitted before accessing this attribute.") - # we are not train/test splitting, then + # we are not train/test splitting, then if not self.train_test_split: - return [(np.arange(self._n_samples_, dtype=int), np.array([], dtype=int)) for _ in range(len(self.estimator_.estimators_))] + return [ + (np.arange(self._n_samples_, dtype=int), np.array([], dtype=int)) + for _ in range(len(self.estimator_.estimators_)) + ] # Stratifier uses a cached _y attribute if available stratifier = self._y if is_classifier(self.estimator_) and self.stratify else None @@ -310,9 +313,11 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = f"y must have type {self._type_of_target_}, got {type_of_target(y)}. " f"If running on a new dataset, call the 'reset' method." ) - + if not self.train_test_split and not isinstance(self.estimator, HonestForestClassifier): - raise RuntimeError(f'Train test split must occur if not using honest forest classifier.') + raise RuntimeError( + "Train test split must occur if not using honest forest classifier." + ) return X, y, covariate_index @@ -807,21 +812,24 @@ def _statistic( if self.train_test_split: # accumulate the predictions across all trees all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test) + delayed(_parallel_predict_proba)( + estimator.estimators_[idx].predict, X, indices_test + ) for idx, (_, indices_test) in enumerate(self.train_test_samples_) ) for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)): _, indices_test = est_indices posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_) else: + all_indices = np.arange(self._n_samples_, dtype=int) + # accumulate the predictions across all trees all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test) - for idx, honest_tree in enumerate(estimator.estimators_) + delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, all_indices) + for idx in range(len(estimator.estimators_)) ) - for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)): - _, indices_test = est_indices - posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_) + for itree, proba in enumerate(all_proba): + posterior_arr[itree, ...] 
= proba.reshape(-1, estimator.n_outputs_) # determine if there are any nans in the final posterior array, when # averaged over the trees @@ -1050,29 +1058,37 @@ def _statistic( y_train = y_train.ravel() estimator.fit(X_train, y_train) - # construct posterior array for all trees (n_trees, n_samples_test, n_outputs) - # for itree, tree in enumerate(estimator.estimators_): - # if predict_posteriors: - # # XXX: currently assumes n_outputs_ == 1 - # posterior_arr[itree, indices_test, ...] = tree.predict_proba(X_test).reshape( - # -1, tree.n_classes_ - # ) - # else: - # posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape( - # -1, tree.n_outputs_ - # ) - # set variables to compute metric samples = indices_test # list of tree outputs. Each tree output is (n_samples, n_outputs), or (n_samples,) if predict_posteriors: - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)( - estimator.estimators_[idx].predict_proba, X, indices_test + # all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( + # delayed(_parallel_predict_proba)( + # estimator.estimators_[idx].predict_proba, X, indices_test + # ) + # for idx, (_, indices_test) in enumerate(self.train_test_samples_) + # ) + + # TODO: probably a more elegant way of doing this + if self.train_test_split: + # accumulate the predictions across all trees + all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_proba)( + estimator.estimators_[idx].predict_proba, X, indices_test + ) + for idx, (_, indices_test) in enumerate(self.train_test_samples_) + ) + else: + all_indices = np.arange(self._n_samples_, dtype=int) + + # accumulate the predictions across all trees + all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_proba)( + estimator.estimators_[idx].predict_proba, X, all_indices + ) + for idx in range(len(estimator.estimators_)) ) - for idx, (_, indices_test) in enumerate(self.train_test_samples_) - ) else: all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( delayed(_parallel_predict_proba)( @@ -1084,7 +1100,12 @@ def _statistic( _, indices_test = est_indices if predict_posteriors: - posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_classes_) + if self.train_test_split: + posterior_arr[itree, indices_test, ...] = proba.reshape( + -1, estimator.n_classes_ + ) + else: + posterior_arr[itree, ...] = proba.reshape(-1, estimator.n_classes_) else: posterior_arr[itree, indices_test, ...] 
= proba.reshape(-1, estimator.n_outputs_)

diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index 4343a27af..39d5d9468 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -512,3 +512,56 @@ def test_small_dataset_dependent(seed):
     stat, pvalue = clf.test(X, y, metric="mi")
     assert pvalue <= 0.05
+
+
+@flaky(max_runs=3)
+def test_no_traintest_split():
+    n_samples = 500
+    n_features = 5
+    rng = np.random.default_rng(seed)
+
+    X = rng.uniform(size=(n_samples // 2, n_features))
+    X2 = X * 2
+    X = np.vstack([X, X2])
+    y = np.vstack(
+        [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))]
+    )  # Binary classification
+
+    clf = FeatureImportanceForestClassifier(
+        estimator=HonestForestClassifier(
+            n_estimators=50,
+            max_features=n_features,
+            random_state=seed,
+            n_jobs=1,
+            honest_fraction=0.5,
+        ),
+        test_size=0.2,
+        train_test_split=False,
+        permute_forest_fraction=None,
+        sample_dataset_per_tree=False,
+    )
+    stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi")
+
+    # since no train-test split, the training is all the data and the testing is none of the data
+    assert_array_equal(clf.train_test_samples_[0][0], np.arange(n_samples))
+    assert_array_equal(clf.train_test_samples_[0][1], np.array([]))
+
+    assert ~np.isnan(pvalue)
+    assert ~np.isnan(stat)
+    assert pvalue <= 0.05, f"{pvalue}"
+
+    stat, pvalue = clf.test(X, y, metric="mi")
+    assert pvalue <= 0.05, f"{pvalue}"
+
+    X = rng.uniform(size=(n_samples, n_features))
+    y = rng.integers(0, 2, size=n_samples)  # Binary classification
+    clf.reset()
+
+    stat, pvalue = clf.test(X, y, metric="mi")
+    assert_almost_equal(stat, 0.0, decimal=1)
+    assert pvalue > 0.05, f"{pvalue}"
+
+    stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi")
+    assert ~np.isnan(pvalue)
+    assert ~np.isnan(stat)
+    assert pvalue > 0.05, f"{pvalue}"

From 36c5582be3e3df51bfffd3eb6c10e67cd2c5e2b2 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Wed, 8 Nov 2023 09:09:01 -0500
Subject: [PATCH 18/26] Fix type check

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py          | 22 ++++++++++------------
 sktree/stats/permutationforest.py |  5 +++--
 sktree/stats/utils.py             |  6 +++---
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 7f35ff9f4..130739ace 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -1,4 +1,4 @@
-from typing import Callable, Tuple, Union
+from typing import Optional, Callable, Tuple, Union

 import numpy as np
 from joblib import Parallel, delayed
@@ -54,7 +54,7 @@ def _parallel_build_trees_with_sepdata(
     covariate_index,
     bootstrap: bool,
     max_samples,
-    sample_weight: ArrayLike = None,
+    sample_weight: Optional[ArrayLike] = None,
     class_weight=None,
     missing_values_in_feature_mask=None,
     classes=None,
@@ -76,6 +76,7 @@ def _parallel_build_trees_with_sepdata(
     else:
         n_samples_bootstrap = None

+    # XXX: this currently creates a copy of X_train on RAM, which is not ideal
     # individual tree permutation of y labels
     if covariate_index is not None:
         indices = np.arange(X_train.shape[0], dtype=int)
@@ -280,7 +281,7 @@ def _statistic(
     ):
         raise NotImplementedError("Subclasses should implement this!")

-    def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = None):
+    def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: Optional[ArrayLike] = None):
         X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True,
dtype=DTYPE) if y.ndim != 2: y = y.reshape(-1, 1) @@ -315,9 +316,7 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = ) if not self.train_test_split and not isinstance(self.estimator, HonestForestClassifier): - raise RuntimeError( - "Train test split must occur if not using honest forest classifier." - ) + raise RuntimeError("Train test split must occur if not using honest forest classifier.") return X, y, covariate_index @@ -325,7 +324,7 @@ def statistic( self, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mi", return_posteriors: bool = False, check_input: bool = True, @@ -444,7 +443,7 @@ def test( self, X, y, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric: str = "mi", n_repeats: int = 1000, return_posteriors: bool = True, @@ -612,7 +611,8 @@ class FeatureImportanceForestRegressor(BaseForestHT): permute_forest_fraction : float, default=None The fraction of trees to permute the covariate index for. If None, then - just one permutation is performed. + just one permutation is performed. If sampling a permutation per tree + is desirable, then the fraction should be set to ``1. / n_estimators``. train_test_split : bool, default=True Whether to split the dataset before passing to the forest. @@ -669,7 +669,6 @@ def __init__( random_state=None, verbose=0, test_size=0.2, - # permute_per_tree=False, sample_dataset_per_tree=False, permute_forest_fraction=None, train_test_split=True, @@ -679,7 +678,6 @@ def __init__( random_state=random_state, verbose=verbose, test_size=test_size, - # permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, permute_forest_fraction=permute_forest_fraction, train_test_split=train_test_split, @@ -698,7 +696,7 @@ def statistic( self, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mse", return_posteriors: bool = False, check_input: bool = True, diff --git a/sktree/stats/permutationforest.py b/sktree/stats/permutationforest.py index 4a27f539f..78a2437a4 100644 --- a/sktree/stats/permutationforest.py +++ b/sktree/stats/permutationforest.py @@ -10,6 +10,7 @@ from sktree._lib.sklearn.ensemble._forest import BaseForest, ForestClassifier, ForestRegressor from .utils import METRIC_FUNCTIONS, REGRESSOR_METRICS, _compute_null_distribution_perm +from typing import Optional class BasePermutationForest(MetaEstimatorMixin): @@ -62,7 +63,7 @@ def _statistic( estimator: BaseForest, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mse", return_posteriors: bool = False, seed=None, @@ -117,7 +118,7 @@ def statistic( self, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mse", return_posteriors: bool = False, check_input: bool = True, diff --git a/sktree/stats/utils.py b/sktree/stats/utils.py index f21dd00a8..7ff7a9f05 100644 --- a/sktree/stats/utils.py +++ b/sktree/stats/utils.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import Optional, Tuple import numpy as np from numpy.typing import ArrayLike @@ -112,7 +112,7 @@ def _compute_null_distribution_perm( est: ForestClassifier, metric: str = "mse", n_repeats: int = 1000, - seed: int = None, + seed: Optional[int] = None, ) -> ArrayLike: """Compute null distribution using permutation method. 
@@ -173,7 +173,7 @@ def _compute_null_distribution_coleman( y_pred_proba_perm: ArrayLike, metric: str = "mse", n_repeats: int = 1000, - seed: int = None, + seed: Optional[int] = None, ) -> Tuple[ArrayLike, ArrayLike]: """Compute null distribution using Coleman method. From 1ff7a5cb1aa6d5e5b38d2159afd7b4036c6cef85 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 8 Nov 2023 09:10:29 -0500 Subject: [PATCH 19/26] Fix typing Signed-off-by: Adam Li --- sktree/experimental/mutual_info.py | 2 +- sktree/stats/forestht.py | 2 +- sktree/stats/permutationforest.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sktree/experimental/mutual_info.py b/sktree/experimental/mutual_info.py index a1820e715..85c8f0a83 100644 --- a/sktree/experimental/mutual_info.py +++ b/sktree/experimental/mutual_info.py @@ -147,7 +147,7 @@ def mutual_info_ksg( algorithm="kd_tree", n_jobs: int = -1, transform: str = "rank", - random_seed: int = None, + random_seed: Optional[int] = None, ): """Compute the generalized (conditional) mutual information KSG estimate. diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 130739ace..f0518dc5e 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -1,4 +1,4 @@ -from typing import Optional, Callable, Tuple, Union +from typing import Callable, Optional, Tuple, Union import numpy as np from joblib import Parallel, delayed diff --git a/sktree/stats/permutationforest.py b/sktree/stats/permutationforest.py index 78a2437a4..edaa0878d 100644 --- a/sktree/stats/permutationforest.py +++ b/sktree/stats/permutationforest.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from numpy.typing import ArrayLike from sklearn.base import MetaEstimatorMixin, clone, is_classifier @@ -10,7 +12,6 @@ from sktree._lib.sklearn.ensemble._forest import BaseForest, ForestClassifier, ForestRegressor from .utils import METRIC_FUNCTIONS, REGRESSOR_METRICS, _compute_null_distribution_perm -from typing import Optional class BasePermutationForest(MetaEstimatorMixin): From c9c22e91218deda424b5a47d43b30744082da50a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 8 Nov 2023 09:31:07 -0500 Subject: [PATCH 20/26] Fix ci Signed-off-by: Adam Li --- ...ot_MI_genuine_hypothesis_testing_forest.py | 6 --- .../plot_MI_imbalanced_hyppo_testing.py | 8 --- examples/hypothesis_testing/plot_might_auc.py | 2 +- .../hypothesis_testing/plot_might_mv_auc.py | 4 +- sktree/stats/forestht.py | 22 ++++++++- sktree/stats/tests/test_forestht.py | 49 ++++++++++++++++--- 6 files changed, 65 insertions(+), 26 deletions(-) diff --git a/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py index cf69eb8e8..2ddec2289 100644 --- a/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py +++ b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py @@ -108,14 +108,8 @@ ), random_state=seed, test_size=test_size, - permute_per_tree=False, - sample_dataset_per_tree=False, ) -print( - f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: " - f"{est.sample_dataset_per_tree}" -) # we test for the first feature set, which is important and thus should return a pvalue < 0.05 stat, pvalue = est.test( X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py index 
c8a5478a4..4d068fb8b 100644
--- a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
+++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
@@ -132,16 +132,10 @@ def make_multiview_classification(
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=False,
-    sample_dataset_per_tree=False,
 )

 mv_results = dict()

-print(
-    f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: "
-    f"{est.sample_dataset_per_tree}"
-)
 # we test for the first feature set, which is important and thus should return a pvalue < 0.05
 stat, pvalue = est.test(
     X, y, covariate_index=np.arange(10, dtype=int), metric="mi", n_repeats=n_repeats
 )
@@ -177,8 +171,6 @@ def make_multiview_classification(
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=False,
-    sample_dataset_per_tree=False,
 )

 rf_results = dict()
diff --git a/examples/hypothesis_testing/plot_might_auc.py b/examples/hypothesis_testing/plot_might_auc.py
index fd032972a..74ea79abd 100644
--- a/examples/hypothesis_testing/plot_might_auc.py
+++ b/examples/hypothesis_testing/plot_might_auc.py
@@ -84,7 +84,7 @@
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=True,
+    permute_forest_fraction=1.0 / n_estimators,
     sample_dataset_per_tree=True,
 )
diff --git a/examples/hypothesis_testing/plot_might_mv_auc.py b/examples/hypothesis_testing/plot_might_mv_auc.py
index d4e573d82..2a492eb7c 100644
--- a/examples/hypothesis_testing/plot_might_mv_auc.py
+++ b/examples/hypothesis_testing/plot_might_mv_auc.py
@@ -72,7 +72,7 @@
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=True,
+    permute_forest_fraction=1.0 / n_estimators,
     sample_dataset_per_tree=True,
 )

@@ -104,7 +104,7 @@
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=True,
+    permute_forest_fraction=1.0 / n_estimators,
     sample_dataset_per_tree=True,
 )
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index f0518dc5e..81426aa51 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -144,7 +144,18 @@ def __init__(

     @property
     def n_estimators(self):
-        return self.estimator_.n_estimators
+        # prefer the fitted forest, then the fitted permuted forest, and
+        # only then fall back to the subclass's default (unfitted) estimator
+        try:
+            return self.estimator_.n_estimators
+        except Exception:
+            try:
+                return self.permuted_estimator_.n_estimators
+            except Exception:
+                return self._get_estimator().n_estimators
+
+    def _get_estimator(self):
+        pass

     def reset(self):
         class_attributes = dir(type(self))
@@ -318,6 +326,18 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: Optional[Arr
         if not self.train_test_split and not isinstance(self.estimator, HonestForestClassifier):
             raise RuntimeError("Train test split must occur if not using honest forest classifier.")

+        if self.permute_forest_fraction is not None and self.permute_forest_fraction < 0.0:
+            raise RuntimeError("permute_forest_fraction must be non-negative.")
+
+        if (
+            self.permute_forest_fraction is not None
+            and self.permute_forest_fraction * self.n_estimators < 1.0
+        ):
+            raise RuntimeError(
+                "permute_forest_fraction must be greater than 1./n_estimators, "
+                f"got {self.permute_forest_fraction}."
+ ) + return X, y, covariate_index def statistic( diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 39d5d9468..f1dbd1bf4 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -34,13 +34,14 @@ @pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): n_samples = 50 + n_estimators = 10 est = FeatureImportanceForestClassifier( estimator=RandomForestClassifier( - n_estimators=10, + n_estimators=n_estimators, random_state=seed, ), # permute_per_tree=True, - permute_forest_fraction=1.0 / n_samples, + permute_forest_fraction=1.0 / n_estimators, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, @@ -69,17 +70,48 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): with pytest.raises(RuntimeError, match="Not all covariate_index"): est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi") + with pytest.raises( + RuntimeError, match="permute_forest_fraction must be greater than 1./n_estimators" + ): + est = FeatureImportanceForestClassifier( + estimator=RandomForestClassifier( + n_estimators=n_estimators, + random_state=seed, + ), + # permute_per_tree=True, + permute_forest_fraction=1.0 / n_samples, + test_size=0.7, + random_state=seed, + sample_dataset_per_tree=sample_dataset_per_tree, + ) + est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") + + with pytest.raises(RuntimeError, match="permute_forest_fraction must be non-negative."): + est = FeatureImportanceForestClassifier( + estimator=RandomForestClassifier( + n_estimators=n_estimators, + random_state=seed, + ), + # permute_per_tree=True, + permute_forest_fraction=-1.0 / n_estimators, + test_size=0.7, + random_state=seed, + sample_dataset_per_tree=sample_dataset_per_tree, + ) + est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") + @pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) def test_featureimportance_forest_stratified(sample_dataset_per_tree): n_samples = 100 + n_estimators = 10 est = FeatureImportanceForestClassifier( estimator=RandomForestClassifier( - n_estimators=10, + n_estimators=n_estimators, random_state=seed, ), # permute_per_tree=True, - permute_forest_fraction=1.0 / n_samples, + permute_forest_fraction=1.0 / n_estimators, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, @@ -106,9 +138,10 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): def test_featureimportance_forest_errors(): # permute_per_tree = False sample_dataset_per_tree = True + n_estimators = 10 est = FeatureImportanceForestClassifier( estimator=RandomForestClassifier( - n_estimators=10, + n_estimators=n_estimators, ), test_size=0.5, permute_forest_fraction=None, @@ -395,14 +428,14 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da n_features = 5 X = rng.uniform(size=(n_samples, n_features)) y = rng.integers(0, 2, size=n_samples) # Binary classification - + n_estimators = 10 clf = FeatureImportanceForestClassifier( estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 + n_estimators=n_estimators, random_state=seed, n_jobs=1, honest_fraction=0.2 ), test_size=0.5, # permute_per_tree=True, - permute_forest_fraction=1.0 / n_samples, + permute_forest_fraction=1.0 / n_estimators, sample_dataset_per_tree=sample_dataset_per_tree, ) other_clf = 
FeatureImportanceForestClassifier( From 2a6cda3eb9bc83ea84b90a1297a9494c6458bb9a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 8 Nov 2023 09:49:32 -0500 Subject: [PATCH 21/26] Remove fluff Signed-off-by: Adam Li --- sktree/stats/forestht.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 81426aa51..fa5f88200 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -207,13 +207,6 @@ def _get_estimators_indices(self, stratifier=None, sample_separate=False): # deterministically seeds = self._seeds - # if sample_separate: - # if self._perm_seeds is None: - # new_rng = np.random.default_rng(np.random.randint(0, 1e6)) - # self._perm_seeds = new_rng.integers( - # low=0, high=np.iinfo(np.int32).max, size=len(self.estimator_.estimators_) - # ) - # seeds = self._perm_seeds for idx, tree in enumerate(self.estimator_.estimators_): seed = seeds[idx] From b183db18cea0e9683f8fe1ad9e5cabfc1625b2cc Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 8 Nov 2023 10:04:45 -0500 Subject: [PATCH 22/26] Remove any mention of permute_per_tree Signed-off-by: Adam Li --- sktree/stats/forestht.py | 2 -- sktree/stats/tests/test_coleman.py | 13 +++++-------- sktree/stats/tests/test_forestht.py | 21 +++++---------------- 3 files changed, 10 insertions(+), 26 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index fa5f88200..86a5bb32f 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -964,7 +964,6 @@ def __init__( random_state=None, verbose=0, test_size=0.2, - # permute_per_tree=False, stratify=True, sample_dataset_per_tree=False, permute_forest_fraction=None, @@ -975,7 +974,6 @@ def __init__( random_state=random_state, verbose=verbose, test_size=test_size, - # permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, stratify=stratify, train_test_split=train_test_split, diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index 676b7a817..d0b40402a 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -52,7 +52,7 @@ n_jobs=-1, ), "random_state": seed, - "permute_per_tree": False, + "permute_per_forest_fraction": None, "sample_dataset_per_tree": False, }, 300, # n_samples @@ -69,7 +69,7 @@ # n_estimators=150, # n_jobs=-1, # ), - # "permute_per_tree": True, + # "permute_per_forest_fraction": 1. / 150, # "sample_dataset_per_tree": True, # }, # 300, # n_samples @@ -86,7 +86,7 @@ n_jobs=-1, ), # "random_state": seed, - "permute_per_tree": True, + "permute_per_forest_fraction": 1.0 / 125, "sample_dataset_per_tree": False, }, 300, # n_samples @@ -171,8 +171,6 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) n_estimators=100, n_jobs=-1, ), - # "random_state": seed, - "permute_per_tree": False, "sample_dataset_per_tree": False, }, 600, # n_samples @@ -190,7 +188,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) # n_jobs=-1, # ), # # "random_state": seed, - # "permute_per_tree": True, + # "permute_per_forest_fraction": 1. 
/ 150, # "sample_dataset_per_tree": True, # }, # 600, # n_samples @@ -206,8 +204,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) n_estimators=100, n_jobs=-1, ), - # "random_state": seed, - "permute_per_tree": True, + "permute_per_forest_fraction": 1.0 / 100, "sample_dataset_per_tree": False, }, 600, # n_samples diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index f1dbd1bf4..a242193b1 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -40,8 +40,7 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): n_estimators=n_estimators, random_state=seed, ), - # permute_per_tree=True, - permute_forest_fraction=1.0 / n_estimators, + permute_forest_fraction=1.0 / n_estimators * 5, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, @@ -78,7 +77,6 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): n_estimators=n_estimators, random_state=seed, ), - # permute_per_tree=True, permute_forest_fraction=1.0 / n_samples, test_size=0.7, random_state=seed, @@ -92,8 +90,7 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): n_estimators=n_estimators, random_state=seed, ), - # permute_per_tree=True, - permute_forest_fraction=-1.0 / n_estimators, + permute_forest_fraction=-1.0 / n_estimators * 5, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, @@ -110,8 +107,7 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): n_estimators=n_estimators, random_state=seed, ), - # permute_per_tree=True, - permute_forest_fraction=1.0 / n_estimators, + permute_forest_fraction=1.0 / n_estimators * 5, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, @@ -136,7 +132,6 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): def test_featureimportance_forest_errors(): - # permute_per_tree = False sample_dataset_per_tree = True n_estimators = 10 est = FeatureImportanceForestClassifier( @@ -187,7 +182,7 @@ def test_iris_pauc_statistic( limit = 0.1 max_features = "sqrt" n_repeats = 200 - n_estimators = 100 + n_estimators = 25 test_size = 0.2 # Check consistency on dataset iris. 
@@ -254,7 +249,6 @@ def test_iris_pauc_statistic( n_estimators=10, ), random_state=seed, - # permute_per_tree=False, permute_forest_fraction=None, sample_dataset_per_tree=False, ), @@ -376,7 +370,6 @@ def test_sample_size_consistency_of_estimator_indices_( n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 ), test_size=0.5, - # permute_per_tree=permute_per_tree, permute_forest_fraction=permute_forest_fraction, sample_dataset_per_tree=sample_dataset_per_tree, stratify=False, @@ -423,7 +416,7 @@ def test_sample_size_consistency_of_estimator_indices_( @pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) @pytest.mark.parametrize("seed", [None, 0]) -def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_dataset_per_tree): +def test_sample_per_tree_samples_consistency_with_sklearnforest(seed, sample_dataset_per_tree): n_samples = 100 n_features = 5 X = rng.uniform(size=(n_samples, n_features)) @@ -434,7 +427,6 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da n_estimators=n_estimators, random_state=seed, n_jobs=1, honest_fraction=0.2 ), test_size=0.5, - # permute_per_tree=True, permute_forest_fraction=1.0 / n_estimators, sample_dataset_per_tree=sample_dataset_per_tree, ) @@ -443,7 +435,6 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 ), test_size=0.5, - # permute_per_tree=False, permute_forest_fraction=None, sample_dataset_per_tree=sample_dataset_per_tree, ) @@ -500,7 +491,6 @@ def test_small_dataset_independent(seed): n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.5 ), test_size=0.2, - # permute_per_tree=False, permute_forest_fraction=None, sample_dataset_per_tree=False, ) @@ -534,7 +524,6 @@ def test_small_dataset_dependent(seed): n_estimators=50, random_state=seed, n_jobs=1, honest_fraction=0.5 ), test_size=0.2, - # permute_per_tree=False, permute_forest_fraction=None, sample_dataset_per_tree=False, ) From 59ae89be645c6e0089989fcef47c76d876adc1f0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 9 Nov 2023 11:47:19 -0500 Subject: [PATCH 23/26] Fix slow test Signed-off-by: Adam Li --- sktree/stats/tests/test_coleman.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index d0b40402a..889da8af6 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -52,7 +52,7 @@ n_jobs=-1, ), "random_state": seed, - "permute_per_forest_fraction": None, + "permute_forest_fraction": None, "sample_dataset_per_tree": False, }, 300, # n_samples @@ -69,7 +69,7 @@ # n_estimators=150, # n_jobs=-1, # ), - # "permute_per_forest_fraction": 1. / 150, + # "permute_forest_fraction": 1. / 150, # "sample_dataset_per_tree": True, # }, # 300, # n_samples @@ -86,7 +86,7 @@ n_jobs=-1, ), # "random_state": seed, - "permute_per_forest_fraction": 1.0 / 125, + "permute_forest_fraction": 1.0 / 125, "sample_dataset_per_tree": False, }, 300, # n_samples @@ -188,7 +188,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) # n_jobs=-1, # ), # # "random_state": seed, - # "permute_per_forest_fraction": 1. / 150, + # "permute_forest_fraction": 1. 
/ 150, # "sample_dataset_per_tree": True, # }, # 600, # n_samples @@ -204,7 +204,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) n_estimators=100, n_jobs=-1, ), - "permute_per_forest_fraction": 1.0 / 100, + "permute_forest_fraction": 1.0 / 100, "sample_dataset_per_tree": False, }, 600, # n_samples From 2e1d53b88eb67592ad4ff3a42ca81c6a4fc66e0c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 9 Nov 2023 12:04:14 -0500 Subject: [PATCH 24/26] Try to fix slow Signed-off-by: Adam Li --- sktree/stats/tests/test_coleman.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index 889da8af6..7f73e6a32 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -199,7 +199,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) FeatureImportanceForestClassifier, { "estimator": RandomForestClassifier( - max_features="sqrt", + max_features=0.3, # random_state=seed, n_estimators=100, n_jobs=-1, From f3aa7d76e016be9c69d673a85f41f2233302fa42 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 9 Nov 2023 12:33:18 -0500 Subject: [PATCH 25/26] Update Signed-off-by: Adam Li --- sktree/stats/tests/test_coleman.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index 7f73e6a32..2ce888afe 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -199,12 +199,12 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) FeatureImportanceForestClassifier, { "estimator": RandomForestClassifier( - max_features=0.3, + max_features='sqrt', # random_state=seed, n_estimators=100, n_jobs=-1, ), - "permute_forest_fraction": 1.0 / 100, + "permute_forest_fraction": 0.3, "sample_dataset_per_tree": False, }, 600, # n_samples From 9d0f2db30f996c9d078c0b034661d16e0150e9bd Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 9 Nov 2023 13:04:56 -0500 Subject: [PATCH 26/26] Lint Signed-off-by: Adam Li --- sktree/stats/tests/test_coleman.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index 2ce888afe..69efbe378 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -199,7 +199,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) FeatureImportanceForestClassifier, { "estimator": RandomForestClassifier( - max_features='sqrt', + max_features="sqrt", # random_state=seed, n_estimators=100, n_jobs=-1,
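
Taken together, this series leaves the hypothesis-testing estimators with three interacting options: ``train_test_split`` (added and enforced in PATCH 17, "Adding ability to turn off train/test split"), ``permute_forest_fraction`` with its validation (PATCH 20), and the pre-existing ``sample_dataset_per_tree``. Below is a minimal usage sketch of the end state, modeled on the ``test_no_traintest_split`` test above. The public import path ``sktree.stats.FeatureImportanceForestClassifier`` and the toy dataset are illustrative assumptions, not taken verbatim from the diffs.

import numpy as np

from sktree import HonestForestClassifier
from sktree.stats import FeatureImportanceForestClassifier  # assumed public path

seed = 12345
rng = np.random.default_rng(seed)
n_samples, n_features, n_estimators = 200, 5, 50

# toy data: class 1 doubles the scale of every feature, so all features
# carry signal about y
X = rng.uniform(size=(n_samples // 2, n_features))
X = np.vstack([X, X * 2])
y = np.concatenate([np.zeros(n_samples // 2), np.ones(n_samples // 2)])

est = FeatureImportanceForestClassifier(
    estimator=HonestForestClassifier(
        n_estimators=n_estimators,
        honest_fraction=0.5,
        random_state=seed,
        n_jobs=1,
    ),
    # with an honest forest, each tree's held-out "honest" half already acts
    # as an evaluation set, so the external train/test split can be disabled;
    # PATCH 17 raises a RuntimeError if this is attempted with a non-honest
    # forest
    train_test_split=False,
    # None performs a single permutation for the whole forest; a value such
    # as 1.0 / n_estimators instead permutes per tree, and PATCH 20 rejects
    # negative fractions or fractions below 1.0 / n_estimators
    permute_forest_fraction=None,
    sample_dataset_per_tree=False,
    random_state=seed,
)

# features 1 and 2 carry signal here, so the p-value should be small
stat, pvalue = est.test(X, y, covariate_index=[1, 2], metric="mi")
print(f"MI statistic: {stat:.3f}, p-value: {pvalue:.3f}")

Note that with ``train_test_split=False`` the ``test_size`` argument is effectively unused: as the diffs above show, ``train_test_samples_`` then returns every sample as training data and an empty test index array for each tree.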