diff --git a/doc/api.rst b/doc/api.rst index dd5ba611c..0c112b384 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -145,10 +145,6 @@ tree models. .. autosummary:: :toctree: generated/ - FeatureImportanceForestRegressor - FeatureImportanceForestClassifier - PermutationForestClassifier - PermutationForestRegressor build_coleman_forest build_permutation_forest build_oob_forest diff --git a/doc/whats_new/v0.5.rst b/doc/whats_new/v0.5.rst index ccb8892e0..c30e50086 100644 --- a/doc/whats_new/v0.5.rst +++ b/doc/whats_new/v0.5.rst @@ -26,7 +26,7 @@ Changelog handles the case where there is one feature view that is exhausted, and another that is not for ``apply_max_features_per_feature_set = False``, by `Adam Li`_ (:pr:`#183`). -- |Fix| :class:`sktree.stats.FeatureImportanceForestClassifier` now correctly passes +- |Fix| ``sktree.stats.FeatureImportanceForestClassifier`` now correctly passes metric kwargs to the null distribution function, by `Adam Li`_ (:pr:`#183`). Code and Documentation Contributors diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst index f12ce3a2a..79e3a5369 100644 --- a/doc/whats_new/v0.6.rst +++ b/doc/whats_new/v0.6.rst @@ -24,7 +24,7 @@ Changelog handles the case where there is one feature view that is exhausted, and another that is not for ``apply_max_features_per_feature_set = False``, by `Adam Li`_ (:pr:`#183`). -- |Fix| :class:`sktree.stats.FeatureImportanceForestClassifier` now correctly passes +- |Fix| ``sktree.stats.FeatureImportanceForestClassifier`` now correctly passes metric kwargs to the null distribution function, by `Adam Li`_ (:pr:`#183`). - |Enhancement| :func:`sktree.datasets.make_trunk_classification` now has a generative model based on Trunk and banded covariance, :func:`sktree.datasets.approximate_clf_mutual_information` and diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst index f021d1412..7b31bea3f 100644 --- a/doc/whats_new/v0.8.rst +++ b/doc/whats_new/v0.8.rst @@ -21,6 +21,10 @@ Changelog - |Feature| Simulations in ``sktree.datasets.hyppo`` now throw a warning instead of an error when the number of samples is less than the number of dimensions. By `Sambit Panda`_ (:pr:`#279`) +- |API| :class:`sktree.HonestForestClassifier` now has ``bootstrap=True`` as the default + argument. By `Adam Li`_ (:pr:`#274`) +- |API| Removed all instances of ``FeatureImportanceForestClassifier`` and outdated + MIGHT code. By `Adam Li`_ (:pr:`#274`) Code and Documentation Contributors ----------------------------------- diff --git a/examples/hypothesis_testing/README.txt b/examples/hypothesis_testing/README.txt deleted file mode 100644 index 9a60917ca..000000000 --- a/examples/hypothesis_testing/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -.. _hyppo_examples: - -Hypothesis testing with decision trees --------------------------------------- - -Examples demonstrating how to use decision-trees for statistical hypothesis testing. 
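This PR drops the class-based ``FeatureImportanceForestClassifier``/``PermutationForest*`` API in favor of the function-based helpers that remain listed in ``doc/api.rst`` above (``build_coleman_forest``, ``build_permutation_forest``, ``build_oob_forest``) together with ``PermutationHonestForestClassifier``. A minimal sketch of the retained workflow is shown below for orientation; the exact keyword arguments of ``build_coleman_forest`` and the shape of its return value are assumptions here and should be checked against the current ``sktree.stats`` signatures.

```python
# Hypothetical sketch of the retained, function-based MIGHT workflow.
# The build_coleman_forest signature and the PermutationHonestForestClassifier
# constructor arguments are assumptions -- verify against sktree.stats.
import numpy as np

from sktree import HonestForestClassifier
from sktree.stats import PermutationHonestForestClassifier, build_coleman_forest

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 20))
y = rng.integers(0, 2, size=200)

# Note: bootstrap=True is now the default for HonestForestClassifier (v0.8 entry above).
est = HonestForestClassifier(n_estimators=100, honest_fraction=0.5, random_state=0)
perm_est = PermutationHonestForestClassifier(
    n_estimators=100, honest_fraction=0.5, random_state=0
)

# Compare the observed forest against a forest whose first ten columns are permuted,
# using the Coleman et al. resampling scheme to build the null distribution.
result = build_coleman_forest(
    est, perm_est, X, y, covariate_index=np.arange(10), metric="mi", n_repeats=1000
)
print(result)  # bundles the observed statistic, p-value, and null distribution
```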
diff --git a/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py deleted file mode 100644 index 2ddec2289..000000000 --- a/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -========================================================= -Mutual Information for Genuine Hypothesis Testing (MIGHT) -========================================================= - -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated datasets. Here, we present a simulation -of how MIGHT is used to test the hypothesis that a "feature set is important for -predicting the target". This is a generalization of the framework presented in -:footcite:`coleman2022scalable`. - -We simulate a dataset with 1000 features, 500 samples, and a binary class target -variable. Within each feature set, there is 500 features associated with one feature -set, and another 500 features associated with another feature set. One could think of -these for example as different datasets collected on the same patient in a biomedical setting. -The first feature set (X) is strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). Here, we are testing the -null hypothesis: - -- ``H0: I(X; y) - I(X, W; y) = 0`` -- ``HA: I(X; y) - I(X, W; y) < 0`` indicating that there is more mutual information with - respect to ``y`` - -where ``I`` is mutual information. For example, this could be true in the following settings, -where X is our informative feature set and W is our uninformative feature set. - -- ``W X -> y``: here ``W`` is completely disconnected from X and y. -- ``W -> X -> y``: here ``W`` is d-separated from y given X. -- ``W <- X -> y``: here ``W`` is d-separated from y given X. - -We then use MIGHT to test the hypothesis that the first feature set is important for -predicting the target, and the second feature set is not important for predicting the -target. We use :class:`~sktree.stats.FeatureImportanceForestClassifier`. -""" - -import numpy as np -from scipy.special import expit - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. 
- -n_samples = 2000 -n_features_set = 20 -mean = 1.0 -sigma = 2.0 -beta = 5.0 - -unimportant_mean = 0.0 -unimportant_sigma = 4.5 - -# first sample the informative features, and then the uniformative features -X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) -X_important = np.hstack( - [ - X_important, - rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) - ), - ] -) - -X_unimportant = rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) -) -X = np.hstack([X_important, X_unimportant]) - -# simulate the binary target variable -y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) - -# %% -# Perform hypothesis testing using Mutual Information -# --------------------------------------------------- -# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis -# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated -# between two forests. One forest is trained on the original dataset, and one forest is trained -# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly. -# -# The null distribution is then estimated in an efficient manner using the framework of -# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors) -# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then -# computed as the proportion of samples in the null distribution that are less than the -# observed test statistic. - -n_estimators = 100 -max_features = "sqrt" -test_size = 0.2 -n_repeats = 1000 -n_jobs = -1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.25, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -# we test for the first feature set, which is important and thus should return a pvalue < 0.05 -stat, pvalue = est.test( - X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats -) -print(f"Estimated MI difference for the important feature set: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 -stat, pvalue = est.test( - X, - y, - covariate_index=np.arange(n_features_set, dtype=int) + n_features_set, - metric="mi", - n_repeats=n_repeats, -) -print(f"Estimated MI difference for the unimportant feature set: {stat} with Pvalue: {pvalue}") - -# %% -# References -# ---------- -# .. footbibliography:: diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py deleted file mode 100644 index 95c5341ae..000000000 --- a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py +++ /dev/null @@ -1,253 +0,0 @@ -""" -============================================================================== -Mutual Information for Genuine Hypothesis Testing (MIGHT) with Imbalanced Data -============================================================================== - -Here, we demonstrate how to do hypothesis testing on highly imbalanced data -in terms of their feature-set dimensionalities. -using mutual information as a test statistic. 
We use the framework of -:footcite:`coleman2022scalable` to estimate pvalues efficiently. - -Here, we simulate two feature sets, one of which is important for the target, -but significantly smaller in dimensionality than the other feature set, which -is unimportant for the target. We then use the MIGHT framework to test for -the importance of each feature set. Instead of leveraging a normal honest random -forest to estimate the posteriors, here we leverage a multi-view honest random -forest, with knowledge of the multi-view structure of the ``X`` data. - -For other examples of hypothesis testing, see the following: - -- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_MI_genuine_hypothesis_testing_forest.py` -- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_might_auc.py` - -For more information on the multi-view decision-tree, see -:ref:`sphx_glr_auto_examples_multiview_plot_multiview_dtc.py`. -""" - -import matplotlib.pyplot as plt -import numpy as np -from sklearn.datasets import make_blobs - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. -# -# Our data will follow the following graphical model: -# -# $(X_1 \rightarrow Y; X_2)$ -# -# where $X_1$ is our signal feature set, $X_2$ is our noise feature set, and $Y$ is our target. -# $X_1$ will be low-dimensional, but $X_2$ is high-dimensional noise. - -seed = 12345 -rng = np.random.default_rng(seed) - - -def make_multiview_classification( - n_samples=100, n_features_1=10, n_features_2=1000, cluster_std=2.0, seed=None -): - rng = np.random.default_rng(seed=seed) - - # Create a high-dimensional multiview dataset with a low-dimensional informative - # subspace in one view of the dataset. - X0_first, y0 = make_blobs( - n_samples=n_samples, - cluster_std=cluster_std, - n_features=n_features_1, - random_state=rng.integers(1, 10000), - centers=1, - center_box=(-2, 2.0), - ) - - X1_first, y1 = make_blobs( - n_samples=n_samples, - cluster_std=cluster_std, - n_features=n_features_1, - random_state=rng.integers(1, 10000), - centers=1, - center_box=(-2.0, 2.0), - ) - - y1[:] = 1 - - # add the second view for y=0 and y=1, which is completely noise - X0 = np.concatenate([X0_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) - X1 = np.concatenate([X1_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) - - # combine the views and targets - X = np.vstack((X0, X1)) - y = np.hstack((y0, y1)).T - - # add noise to the data - X = X + rng.standard_normal(size=X.shape) - - return X, y - - -n_samples = 100 -n_features_1 = 10 -n_features_2 = 10_000 -n_features_views = [n_features_1, n_features_1 + n_features_2] - -X, y = make_multiview_classification( - n_samples=n_samples, - n_features_1=n_features_1, - n_features_2=n_features_2, - cluster_std=5.0, - seed=seed, -) - -print(X.shape, y.shape, n_features_views) -# %% -# Perform hypothesis testing using Mutual Information -# --------------------------------------------------- -# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis -# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated -# between two forests. 
One forest is trained on the original dataset, and one forest is trained -# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly. -# -# The null distribution is then estimated in an efficient manner using the framework of -# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors) -# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then -# computed as the proportion of samples in the null distribution that are less than the -# observed test statistic. - -n_estimators = 100 -max_features = "sqrt" -test_size = 0.2 -n_repeats = 1000 -n_jobs = -1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=MultiViewDecisionTreeClassifier( - feature_set_ends=n_features_views, - apply_max_features_per_feature_set=True, - ), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -mv_results = dict() - -# we test for the overall MI of X vs y -stat, pvalue = est.test(X, y, metric="mi", n_repeats=n_repeats) -mv_results["feature_stat"] = stat -mv_results["feature_pvalue"] = pvalue -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# we test for the first feature set, which is important and thus should return a pvalue < 0.05 -stat, pvalue = est.test( - X, y, covariate_index=np.arange(n_features_1, dtype=int), metric="mi", n_repeats=n_repeats -) -mv_results["important_feature_stat"] = stat -mv_results["important_feature_pvalue"] = pvalue -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 -stat, pvalue = est.test( - X, - y, - covariate_index=np.arange(n_features_1, n_features_2, dtype=int), - metric="mi", - n_repeats=n_repeats, -) -mv_results["unimportant_feature_stat"] = stat -mv_results["unimportant_feature_pvalue"] = pvalue -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# %% -# Let's investigate what happens when we do not use a multi-view decision tree. -# All other parameters are kept the same. 
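The null-distribution recipe described in the comment block above (and repeated in several of the examples deleted by this PR) reduces to an ordinary permutation-style p-value. A minimal NumPy sketch, assuming a statistic where larger values favor the alternative; for error-style metrics such as MSE the comparison flips:

```python
import numpy as np

rng = np.random.default_rng(0)

# Stand-in values: in MIGHT the observed statistic is the metric difference between
# the observed and permuted forests, and the null distribution comes from re-splitting
# the pooled tree posteriors n_repeats times.
observed_stat = 0.12
null_dist = rng.normal(loc=0.0, scale=0.05, size=1000)

# Right-tailed permutation p-value, with a +1 correction so it is never exactly zero.
pvalue = (1 + np.sum(null_dist >= observed_stat)) / (1 + null_dist.size)
print(f"p-value: {pvalue:.4f}")
```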
- -# to ensure max-features is the same across the two models -max_features = int(np.sqrt(n_features_1) + np.sqrt(n_features_2)) - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -rf_results = dict() - -# we test for the overall MI of X vs y -stat, pvalue = est.test(X, y, metric="mi", n_repeats=n_repeats) -rf_results["feature_stat"] = stat -rf_results["feature_pvalue"] = pvalue -print("\n\nAnalyzing regular decision tree models:") -print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") - -# we test for the first feature set, which is important and thus should return a pvalue < 0.05 -stat, pvalue = est.test( - X, y, covariate_index=np.arange(n_features_1, dtype=int), metric="mi", n_repeats=n_repeats -) -rf_results["important_feature_stat"] = stat -rf_results["important_feature_pvalue"] = pvalue -print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 -stat, pvalue = est.test( - X, - y, - covariate_index=np.arange(n_features_1, n_features_2, dtype=int), - metric="mi", - n_repeats=n_repeats, -) -rf_results["unimportant_feature_stat"] = stat -rf_results["unimportant_feature_pvalue"] = pvalue -print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") - -fig, ax = plt.subplots(figsize=(5, 3)) - -# plot pvalues -ax.bar(0, rf_results["important_feature_pvalue"], label="Permuting $X_1$ (RF)") -ax.bar(1, rf_results["unimportant_feature_pvalue"], label="Permuting $X_2$ (RF)") -ax.bar(2, mv_results["important_feature_pvalue"], label="Permuting $X_1$ (MV)") -ax.bar(3, mv_results["unimportant_feature_pvalue"], label="Permuting $X_2$ (MV)") -ax.bar(4, mv_results["feature_pvalue"], label="Overall Feature Set (MV)") -ax.bar(5, rf_results["feature_pvalue"], label="Overall Feature Set (RF)") -ax.axhline(0.05, color="k", linestyle="--", label="alpha=0.05") -ax.set(ylabel="Log10(PValue)", xlim=[-0.5, 5.5], yscale="log") -ax.legend() - -fig.tight_layout() -plt.show() - -# %% -# Discussion -# ---------- -# We see that the multi-view decision tree is able to detect the important feature set, -# while the regular decision tree is not. This is because the regular decision tree -# is not aware of the multi-view structure of the data, and thus is challenged -# by the imbalanced dimensionality of the feature sets. I.e. it rarely splits on -# the first low-dimensional feature set, and thus is unable to detect its importance. -# -# Note both approaches still fail to reject the null hypothesis (for alpha of 0.05) -# when testing the unimportant feature set. The difference in the two approaches -# show the statistical power of the multi-view decision tree is higher than the -# regular decision tree in this simulation. - -# %% -# References -# ---------- -# .. 
footbibliography:: diff --git a/examples/hypothesis_testing/plot_co_MIGHT_alternative.py b/examples/hypothesis_testing/plot_co_MIGHT_alternative.py deleted file mode 100644 index fd33c335e..000000000 --- a/examples/hypothesis_testing/plot_co_MIGHT_alternative.py +++ /dev/null @@ -1,220 +0,0 @@ -""" -==================================================================================== -Demonstrate Conditional Mutual Information for Genuine Hypothesis Testing (Co-MIGHT) -==================================================================================== - -In this example, we demonstrate how to test the conditional mutual information (CMI) -hypothesis test. To perform CMI testing, we have the hypothesis test: - -- $H_0: I(X_2; Y | X_1) = 0$ -- $H_1: I(X_2; Y | X_1) > 0$ - -Here, we simulate two feature-sets, which are both informative for the target. The -data-generating process follows the graphical model: - -$(X_1 \\rightarrow X_2 \\rightarrow Y; X_1 \\rightarrow Y)$ - -This means that $I(X_1; Y | X_2) > 0$ if we had a perfect estimate of CMI. - -We will demonstrate how to perform the CMI test properly using a conditional -permutation (compared to a standard permutation) of samples. We specifically -explore the case where the alternative hypothesis is true and determine when -the test is able to reject the null hypothesis correctly. - -.. note:: This does not exactly implement conditional-independence testing - yet, since we do not have a way to estimate the null distribution of - $I(X_2; Y | X_1)$. However, we can still demonstrate the power of the - test in the case where the alternative hypothesis is true. -""" - -import matplotlib.pyplot as plt -import numpy as np -from sklearn.datasets import make_spd_matrix - -from sktree import HonestForestClassifier -from sktree.datasets import make_gaussian_mixture -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. -seed = 12345 -rng = np.random.default_rng(seed) - -n_samples = 200 - -n_features = 20 -noise_dims = 80 -class_probs = [0.6, 0.4] -n_features_2 = 1000 - noise_dims - -fixed_center = rng.standard_normal(size=(n_features,)) -centers = [fixed_center, fixed_center] - -covariances = [ - make_spd_matrix(n_dim=n_features, random_state=seed), - make_spd_matrix(n_dim=n_features, random_state=seed + 123), -] - -Xs, y = make_gaussian_mixture( - centers, - covariances, - n_samples=n_samples, - noise=1.0, - noise_dims=noise_dims, - shuffle=True, - class_probs=class_probs, - random_state=seed, - transform="linear", -) - -second_X = Xs[0] -first_X = Xs[1] - -print(first_X.shape, second_X.shape) -X = np.hstack((first_X, second_X, rng.standard_normal(size=(n_samples, n_features_2 - n_features)))) -n_features_ends = [ - n_features + noise_dims, - n_features_2 + n_features + noise_dims * 2, -] - -print(X.shape, y.shape, n_features_ends) - -# %% -# Perform hypothesis testing -# -------------------------- -# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis -# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated -# between two forests. 
One forest is trained on the original dataset, and one forest is trained -# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly. -# -# The null distribution is then estimated in an efficient manner using the framework of -# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors) -# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then -# computed as the proportion of samples in the null distribution that are less than the -# observed test statistic. - -n_estimators = 200 -max_features = 0.3 -test_size = 0.2 -n_repeats = 1000 -n_jobs = -1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=MultiViewDecisionTreeClassifier( - feature_set_ends=n_features_ends, - apply_max_features_per_feature_set=True, - ), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - sample_dataset_per_tree=False, -) - -mv_results = dict() - -# we test for the first feature set, which is lower-dimensional -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -mv_results["low_dim_feature_stat"] = stat -mv_results["low_dim_feature_pvalue"] = pvalue -print("Analysis with multi-view decision tree.") -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -mv_results["high_dim_feature_stat"] = stat -mv_results["high_dim_feature_pvalue"] = pvalue -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# %% -# Let's investigate what happens when we do not use a multi-view decision tree. -# All other parameters are kept the same. 
- -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - sample_dataset_per_tree=False, -) - -rf_results = dict() - -# we test for the first feature set, which is lower-dimensional -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -rf_results["low_dim_feature_stat"] = stat -rf_results["low_dim_feature_pvalue"] = pvalue -print("\n\nAnalysing now with a regular decision-tree") -print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -rf_results["high_dim_feature_stat"] = stat -rf_results["high_dim_feature_pvalue"] = pvalue -print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") - -fig, ax = plt.subplots(figsize=(5, 3)) - -# plot pvalues -ax.bar(0, rf_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (RF)") -ax.bar(1, rf_results["high_dim_feature_pvalue"], label="High-dim Feature Set (RF)") -ax.bar(2, mv_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (MV)") -ax.bar(3, mv_results["high_dim_feature_pvalue"], label="High-dim Feature Set (MV)") -ax.axhline(0.05, color="k", linestyle="--", label="alpha=0.05") -ax.set(ylabel="Log10(PValue)", xlim=[-0.5, 3.5], yscale="log") -ax.legend() - -fig.tight_layout() -plt.show() - -# %% -# Discussion -# ---------- -# In this example, since both feature-sets are in informative for the target, the true -# answer should be reject the null hypothesis. We see that neither the regular decision -# tree, nor the multi-view decision tree is able to reject the null hypothesis correctly. -# -# However, if we permute the lower-dimensional feature-set, the multi-view decision tree -# has a lower pvalue than the regular decision tree, which allows us to correctly -# reject the null hypothesis at lower $\alpha$ levels. - -# %% -# References -# ---------- -# .. footbibliography:: diff --git a/examples/hypothesis_testing/plot_co_MIGHT_null.py b/examples/hypothesis_testing/plot_co_MIGHT_null.py deleted file mode 100644 index 2e6325cd1..000000000 --- a/examples/hypothesis_testing/plot_co_MIGHT_null.py +++ /dev/null @@ -1,317 +0,0 @@ -""" -==================================================== -Co-MIGHT when Data Exhibits Conditional Independence -==================================================== - -In this example, we demonstrate how to test the conditional mutual information (CMI) -hypothesis test using conditional mutual information for genuine hypothesis test (Co-MIGHT). -To perform CMI testing, we have the hypothesis test: - -- $H_0: I(X_2; Y | X_1) = 0$ -- $H_1: I(X_2; Y | X_1) > 0$ - -Here, we simulate two feature-sets, which follow the null-hypothesis with the specific -setting that $X_2 \\perp \{Y, X_1\}$. We will test using the multi-view decision -tree to verify that that the null hypothesis is not rejected. 
-""" - -import matplotlib.pyplot as plt -import numpy as np -from sklearn.datasets import make_classification - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. - -n_samples = 200 -n_features_1 = 20 -noise_dims = 80 -n_features_2 = 1000 - -signal_X, y = make_classification( - n_samples=n_samples, - n_features=n_features_1 + noise_dims, - n_informative=n_features_1, - n_redundant=50, - n_repeated=0, - n_classes=2, - class_sep=0.5, - flip_y=0.01, - shuffle=True, - random_state=seed, -) - -# model parameters -n_estimators = 200 -max_features = 0.3 -test_size = 0.2 -n_repeats = 1000 -n_jobs = -1 - -# %% -# Analysis when the null hypothesis is true -# ----------------------------------------- -# Let's now investigate what happens when the null hypothesis is true. We will simulate -# data from the graphical model: -# -# $(X_1 \\rightarrow Y; X_2)$ -# -# Here, we either have $X_1$ or $X_2$ informative for the target, but not both. We will -# then perform hypothesis testing using the same procedure as above. We will test the settings -# when the high-dimensional feature-set is informative for the target, and when the -# low-dimensional feature-set is informative for the target. - -# Make X_2 high-dimensional -n_features_ends = [n_features_1 + noise_dims, signal_X.shape[1]] -_X = np.hstack((signal_X, rng.standard_normal(size=(n_samples, n_features_2)))) -X = _X.copy() -n_features_ends[1] = X.shape[1] - -print(X.shape, y.shape, n_features_ends) - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=MultiViewDecisionTreeClassifier( - feature_set_ends=n_features_ends, - apply_max_features_per_feature_set=True, - ), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - sample_dataset_per_tree=False, -) - -rf_est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -rf_results = dict() -mv_results = dict() - -# we test for the first feature set, which is lower-dimensional -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -mv_results["low_dim_feature_stat"] = stat -mv_results["low_dim_feature_pvalue"] = pvalue -print("\n\nImportant feature-set is low-dimensional") -print(f"Estimated MI difference with first view (has dependency): {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -mv_results["high_dim_feature_stat"] = stat -mv_results["high_dim_feature_pvalue"] = pvalue -print( - f"Estimated MI difference testing second view (does not have dependency): " - f"{stat} with Pvalue: {pvalue}" -) - -# %% -# Now, we will 
compare with using a standard decision tree classifier as our base model. - -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = rf_est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -rf_results["low_dim_feature_stat"] = stat -rf_results["low_dim_feature_pvalue"] = pvalue -print("\n\nComparing with random forest.") -print(f"Estimated MI difference with first view (has dependency): {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = rf_est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -rf_results["high_dim_feature_stat"] = stat -rf_results["high_dim_feature_pvalue"] = pvalue -print( - f"Estimated MI difference testing second view (does not have dependency): " - f"{stat} with Pvalue: {pvalue}" -) - -fig, ax = plt.subplots(figsize=(5, 3)) - -# plot pvalues -ax.bar(0, rf_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (RF)", color="black") -ax.bar(1, rf_results["high_dim_feature_pvalue"], label="High-dim Feature Set (RF)", color="gray") -ax.bar(2, mv_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (MV)", color="green") -ax.bar(3, mv_results["high_dim_feature_pvalue"], label="High-dim Feature Set (MV)", color="blue") -ax.axhline(0.05, color="k", linestyle="--", label="alpha=0.05") -ax.set( - ylabel="Log10(PValue)", - xlim=[-0.5, 3.5], - yscale="log", - title="Signal Feature-set is Low-dimensional", -) -ax.legend() - -fig.tight_layout() -plt.show() - -# %% -# Now, we will make the informative feature-set, $X_1$, high-dimensional -# and verify that the null hypothesis is not rejected still. 
-_X = np.hstack( - ( - signal_X, - rng.standard_normal(size=(n_samples, n_features_2 - signal_X.shape[1])), - rng.standard_normal(size=(n_samples, n_features_1 + noise_dims)), - ) -) -X = _X.copy() -n_features_ends = [n_features_2, X.shape[1]] -print("\n\nSetting important feature-set to be high-dimensional.") -print(X.shape, n_features_ends) - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=MultiViewDecisionTreeClassifier( - feature_set_ends=n_features_ends, - apply_max_features_per_feature_set=True, - ), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -rf_est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -mv_results = dict() -rf_results = dict() - -# we test for the first feature set, which is lower-dimensional -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -mv_results["high_dim_feature_stat"] = stat -mv_results["high_dim_feature_pvalue"] = pvalue -print("\n\nImportant feature-set is high-dimensional") -print(f"Estimated MI difference with first view (has dependency): {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -mv_results["low_dim_feature_stat"] = stat -mv_results["low_dim_feature_pvalue"] = pvalue -print( - f"Estimated MI difference testing second view (does not have dependency): " - f"{stat} with Pvalue: {pvalue}" -) - -# %% -# Again, we compare to using a standard decision tree classifier as our base model. 
- -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = rf_est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -rf_results["low_dim_feature_stat"] = stat -rf_results["low_dim_feature_pvalue"] = pvalue -print("\n\nComparing with random forest.") -print(f"Estimated MI difference with first view (has dependency): {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = rf_est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -rf_results["high_dim_feature_stat"] = stat -rf_results["high_dim_feature_pvalue"] = pvalue -print( - f"Estimated MI difference testing second view (does not have dependency): " - f"{stat} with Pvalue: {pvalue}" -) - -fig, ax = plt.subplots(figsize=(5, 3)) - -# plot pvalues -ax.bar(0, rf_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (RF)", color="black") -ax.bar(1, rf_results["high_dim_feature_pvalue"], label="High-dim Feature Set (RF)", color="black") -ax.bar(2, mv_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (MV)", color="green") -ax.bar(3, mv_results["high_dim_feature_pvalue"], label="High-dim Feature Set (MV)", color="green") -ax.axhline(0.05, color="k", linestyle="--", label="alpha=0.05") -ax.set( - ylabel="Log10(PValue)", - xlim=[-0.5, 3.5], - yscale="log", - title="Signal Feature-set is High-dimensional", -) -ax.legend() - -fig.tight_layout() -plt.show() - - -# %% -# Discussion -# ---------- -# We see that when the null hypothesis is true, the multi-view decision tree does not -# reject the null hypothesis. In addition, it rejects the null hypothesis when there is -# a dependency between the target and the feature-set even when the feature-set is -# higher-dimensionality. This is in contrast to the standard decision tree, which -# fails to reject the null hypothesis when the feature-set with signal is -# higher-dimensional. - -# %% -# References -# ---------- -# .. footbibliography:: diff --git a/examples/hypothesis_testing/plot_might_auc.py b/examples/hypothesis_testing/plot_might_auc.py deleted file mode 100644 index 74ea79abd..000000000 --- a/examples/hypothesis_testing/plot_might_auc.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -=================================================================================== -Compute partial AUC using Mutual Information for Genuine Hypothesis Testing (MIGHT) -=================================================================================== - -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated datasets. Here, we present a simulation -of how MIGHT is used to evaluate how a "feature set is important for predicting the target". - -We simulate a dataset with 1000 features, 500 samples, and a binary class target -variable. Within each feature set, there is 500 features associated with one feature -set, and another 500 features associated with another feature set. One could think of -these for example as different datasets collected on the same patient in a biomedical setting. -The first feature set (X) is strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). - -We then use MIGHT to calculate the partial AUC of these sets. 
-""" - -import numpy as np -from scipy.special import expit - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. - -n_samples = 1000 -n_features_set = 500 -mean = 1.0 -sigma = 2.0 -beta = 5.0 - -unimportant_mean = 0.0 -unimportant_sigma = 4.5 - -# first sample the informative features, and then the uniformative features -X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) -X_important = np.hstack( - [ - X_important, - rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) - ), - ] -) - -X_unimportant = rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) -) - -# simulate the binary target variable -y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) - -# %% -# Use partial AUC as test statistic -# --------------------------------- -# You can specify the maximum specificity by modifying ``max_fpr`` in ``statistic``. - -n_estimators = 125 -max_features = "sqrt" -metric = "auc" -test_size = 0.2 -n_jobs = -1 -honest_fraction = 0.7 -max_fpr = 0.1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=honest_fraction, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - permute_forest_fraction=1.0 / n_estimators, - sample_dataset_per_tree=True, -) - -# we test for the first feature set, which is important and thus should return a higher AUC -stat, posterior_arr, samples = est.statistic( - X_important, - y, - metric=metric, - return_posteriors=True, - max_fpr=max_fpr, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# Repeat for the second feature set -# --------------------------------- -# This feature set has a smaller statistic, which is expected due to its weak correlation. - -stat, posterior_arr, samples = est.statistic( - X_unimportant, - y, - metric=metric, - return_posteriors=True, - max_fpr=max_fpr, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# All posteriors are saved within the model -# ----------------------------------------- -# Extract the results from the model variables anytime. You can save the model with ``pickle``. 
-# -# ASH-90 / Partial AUC: ``est.observe_stat_`` -# -# Observed Samples: ``est.observe_samples_`` -# -# Tree Posteriors for the positive class: ``est.observe_posteriors_`` (n_trees, n_samples_test, 1) -# -# True Labels: ``est.y_true_final_`` diff --git a/examples/hypothesis_testing/plot_might_mv_auc.py b/examples/hypothesis_testing/plot_might_mv_auc.py deleted file mode 100644 index 2a492eb7c..000000000 --- a/examples/hypothesis_testing/plot_might_mv_auc.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -===================================================== -Compute partial AUC using multi-view MIGHT (MV-MIGHT) -===================================================== - -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated mutli-view datasets. Here, we present -how to estimate partial AUROC from a multi-view feature set. - -We simulate a dataset with 510 features, 1000 samples, and a binary class target variable. -The first 10 features (X) are strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). - -We then use MV-MIGHT to calculate the partial AUC of these sets. -""" - -import numpy as np -from scipy.special import expit - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. - -n_samples = 1000 -n_features_set = 500 -mean = 1.0 -sigma = 2.0 -beta = 5.0 - -unimportant_mean = 0.0 -unimportant_sigma = 4.5 - -# first sample the informative features, and then the uniformative features -X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) -X_unimportant = rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) -) -X = np.hstack([X_important, X_unimportant]) - -# simulate the binary target variable -y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) - -# %% -# Use partial AUC as test statistic -# --------------------------------- -# You can specify the maximum specificity by modifying ``max_fpr`` in ``statistic``. - -n_estimators = 125 -max_features = "sqrt" -metric = "auc" -test_size = 0.2 -n_jobs = -1 -honest_fraction = 0.5 -max_fpr = 0.1 - -est_mv = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=MultiViewDecisionTreeClassifier(feature_set_ends=[10, 10 + n_features_set]), - honest_fraction=honest_fraction, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - permute_forest_fraction=1.0 / n_estimators, - sample_dataset_per_tree=True, -) - -# we test with the multi-view setting, thus should return a higher AUC -stat, posterior_arr, samples = est_mv.statistic( - X, - y, - metric=metric, - return_posteriors=True, - max_fpr=max_fpr, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# Repeat without multi-view -# --------------------------------- -# This feature set has a smaller statistic, which is expected due to its lack of multi-view setting. 
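The ``max_fpr`` argument used throughout these AUC examples corresponds to scikit-learn's standardized partial AUC, which restricts the ROC curve to the low false-positive-rate region. Before the non-multi-view comparison that follows, a standalone sketch with synthetic scores (not the forest posteriors from this example) illustrates the metric itself:

```python
import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)

# Synthetic labels and scores standing in for the tree-averaged posteriors.
y_true = rng.integers(0, 2, size=500)
y_score = np.clip(0.3 * y_true + rng.normal(0.5, 0.25, size=500), 0.0, 1.0)

full_auc = roc_auc_score(y_true, y_score)
# Partial AUC over FPR <= 0.1, standardized by scikit-learn so that 0.5 is chance level.
partial_auc = roc_auc_score(y_true, y_score, max_fpr=0.1)
print(f"Full AUC: {full_auc:.3f}, partial AUC (max_fpr=0.1): {partial_auc:.3f}")
```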
- -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - honest_fraction=honest_fraction, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - permute_forest_fraction=1.0 / n_estimators, - sample_dataset_per_tree=True, -) - -stat, posterior_arr, samples = est.statistic( - X, - y, - metric=metric, - return_posteriors=True, - max_fpr=max_fpr, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# All posteriors are saved within the model -# ----------------------------------------- -# Extract the results from the model variables anytime. You can save the model with ``pickle``. -# -# ASH-90 / Partial AUC: ``est_mv.observe_stat_`` -# -# Observed Samples: ``est_mv.observe_samples_`` -# -# Tree Posteriors for the positive class: ``est_mv.observe_posteriors_`` -# (n_trees, n_samples_test, 1) -# -# True Labels: ``est_mv.y_true_final_`` diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index b1647b9ca..484fca881 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -402,7 +402,7 @@ def __init__( max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=0.0, - bootstrap=False, + bootstrap=True, oob_score=False, n_jobs=None, random_state=None, diff --git a/sktree/stats/__init__.py b/sktree/stats/__init__.py index 188694c0a..8ca7a11ea 100644 --- a/sktree/stats/__init__.py +++ b/sktree/stats/__init__.py @@ -1,20 +1,13 @@ from .forestht import ( - FeatureImportanceForestClassifier, - FeatureImportanceForestRegressor, build_coleman_forest, build_cv_forest, build_oob_forest, build_permutation_forest, ) from .monte_carlo import PermutationTest -from .permutationforest import PermutationForestClassifier, PermutationForestRegressor from .permuteforest import PermutationHonestForestClassifier __all__ = [ - "FeatureImportanceForestClassifier", - "FeatureImportanceForestRegressor", - "PermutationForestClassifier", - "PermutationForestRegressor", "PermutationTest", "build_cv_forest", "build_oob_forest", diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index b84b06973..a03a7dd73 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -1,39 +1,19 @@ import threading from collections import namedtuple -from typing import Callable, Optional, Tuple, Union +from typing import Callable import numpy as np from joblib import Parallel, delayed from numpy.typing import ArrayLike -from sklearn.base import MetaEstimatorMixin, clone, is_classifier +from sklearn.base import clone from sklearn.ensemble._base import _partition_estimators -from sklearn.ensemble._forest import ForestClassifier as sklearnForestClassifier -from sklearn.ensemble._forest import ForestRegressor as sklearnForestRegressor from sklearn.model_selection import StratifiedKFold, train_test_split from sklearn.utils.multiclass import type_of_target -from sklearn.utils.validation import _is_fitted, check_X_y -from .._lib.sklearn.ensemble._forest import ( - ForestClassifier, - ForestRegressor, - RandomForestClassifier, - RandomForestRegressor, - _get_n_samples_bootstrap, - _parallel_build_trees, -) -from ..ensemble._honest_forest import HonestForestClassifier -from ..experimental import conditional_resample -from ..tree import DecisionTreeClassifier, DecisionTreeRegressor +from 
.._lib.sklearn.ensemble._forest import ForestClassifier from ..tree._classes import DTYPE from .permuteforest import PermutationHonestForestClassifier -from .utils import ( - METRIC_FUNCTIONS, - POSITIVE_METRICS, - POSTERIOR_FUNCTIONS, - REGRESSOR_METRICS, - _compute_null_distribution_coleman, - _non_nan_samples, -) +from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS, _compute_null_distribution_coleman def _parallel_predict_proba(predict_proba, X, indices_test): @@ -48,1156 +28,6 @@ def _parallel_predict_proba(predict_proba, X, indices_test): return prediction -def _parallel_build_trees_with_sepdata( - tree: Union[DecisionTreeClassifier, DecisionTreeRegressor], - n_trees: int, - idx: int, - indices_train: ArrayLike, - X: ArrayLike, - y: ArrayLike, - covariate_index, - bootstrap: bool, - max_samples, - sample_weight: Optional[ArrayLike] = None, - class_weight=None, - missing_values_in_feature_mask=None, - classes=None, - random_state=None, -): - """Parallel function to build trees and compute posteriors. - - This inherently assumes that the caller function defines the indices - for the training and testing data for each tree. - """ - rng = np.random.default_rng(random_state) - X_train = X[indices_train, :] - y_train = y[indices_train, ...] - - if bootstrap: - n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples=X_train.shape[0], max_samples=max_samples - ) - else: - n_samples_bootstrap = None - - # XXX: this currently creates a copy of X_train on RAM, which is not ideal - # individual tree permutation of y labels - if covariate_index is not None: - indices = np.arange(X_train.shape[0], dtype=int) - # perform permutation of covariates - index_arr = rng.choice(indices, size=(X_train.shape[0], 1), replace=False, shuffle=True) - perm_X_cov = X_train[index_arr, covariate_index] - X_train[:, covariate_index] = perm_X_cov - - tree = _parallel_build_trees( - tree, - bootstrap, - X_train, - y_train, - sample_weight, - idx, - n_trees, - verbose=0, - class_weight=class_weight, - n_samples_bootstrap=n_samples_bootstrap, - missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, - ) - return tree - - -class BaseForestHT(MetaEstimatorMixin): - observe_samples_: ArrayLike - observe_posteriors_: ArrayLike - observe_stat_: float - permute_samples_: ArrayLike - permute_posteriors_: ArrayLike - permute_stat_: float - - def __init__( - self, - estimator=None, - random_state=None, - verbose=0, - test_size=0.2, - stratify=False, - conditional_perm=False, - sample_dataset_per_tree=False, - permute_forest_fraction=None, - train_test_split=True, - ): - self.estimator = estimator - self.random_state = random_state - self.verbose = verbose - self.test_size = test_size - self.stratify = stratify - self.conditional_perm = conditional_perm - - self.train_test_split = train_test_split - # XXX: possibly removing these parameters - self.sample_dataset_per_tree = sample_dataset_per_tree - self.permute_forest_fraction = permute_forest_fraction - - self.n_samples_test_ = None - self._n_samples_ = None - self._metric = None - self._covariate_index_cache_ = None - self._type_of_target_ = None - self.n_features_in_ = None - self._is_fitted = False - self._seeds = None - self._perm_seeds = None - - @property - def n_estimators(self): - try: - return self.estimator_.n_estimators - except Exception: - return self.permuted_estimator_.n_estimators - finally: - return self._get_estimator().n_estimators - - def reset(self): - class_attributes = dir(type(self)) - instance_attributes = dir(self) - - for 
attr_name in instance_attributes: - if attr_name.endswith("_") and attr_name not in class_attributes: - delattr(self, attr_name) - - self.n_samples_test_ = None - self._n_samples_ = None - self._covariate_index_cache_ = None - self._type_of_target_ = None - self._metric = None - self.n_features_in_ = None - self._is_fitted = False - self._seeds = None - self._y = None - - def _get_estimators_indices(self, stratifier=None, sample_separate=False): - indices = np.arange(self._n_samples_, dtype=int) - - # Get drawn indices along both sample and feature axes - rng = np.random.default_rng(self.estimator_.random_state) - - if self.permute_forest_fraction is None: - permute_forest_fraction = 0.0 - else: - permute_forest_fraction = self.permute_forest_fraction - - # TODO: consolidate how we "sample/permute" per subset of the forest - if self.sample_dataset_per_tree or permute_forest_fraction > 0.0: - # sample random seeds - if self._seeds is None: - self._seeds = [] - self._n_permutations = 0 - - for itree in range(self.n_estimators): - # For every N-trees that are defined by permute forest fraction - # we will sample a new seed appropriately - if itree % max(int(permute_forest_fraction * self.n_estimators), 1) == 0: - tree = self.estimator_.estimators_[itree] - if tree.random_state is None: - seed = rng.integers(low=0, high=np.iinfo(np.int32).max) - else: - seed = tree.random_state - - self._seeds.append(seed) - seeds = self._seeds - - for idx, tree in enumerate(self.estimator_.estimators_): - seed = seeds[idx] - - # Operations accessing random_state must be performed identically - # to those in `_parallel_build_trees()` - indices_train, indices_test = train_test_split( - indices, - test_size=self.test_size, - shuffle=True, - stratify=stratifier, - random_state=seed, - ) - - yield indices_train, indices_test - else: - if self._seeds is None: - if self.estimator_.random_state is None: - self._seeds = rng.integers(low=0, high=np.iinfo(np.int32).max) - else: - self._seeds = self.estimator_.random_state - - indices_train, indices_test = train_test_split( - indices, - shuffle=True, - test_size=self.test_size, - stratify=stratifier, - random_state=self._seeds, - ) - - for _ in range(self.estimator_.n_estimators): - yield indices_train, indices_test - - @property - def train_test_samples_(self): - """ - The subset of drawn samples for each base estimator. - - Returns a dynamically generated list of indices identifying - the samples used for fitting each member of the ensemble, i.e., - the in-bag samples. - - Note: the list is re-created at each call to the property in order - to reduce the object memory footprint by not storing the sampling - data. Thus fetching the property may be slower than expected. 
- """ - if self._n_samples_ is None: - raise RuntimeError("The estimator must be fitted before accessing this attribute.") - - # we are not train/test splitting, then - if not self.train_test_split: - return [ - (np.arange(self._n_samples_, dtype=int), np.array([], dtype=int)) - for _ in range(len(self.estimator_.estimators_)) - ] - - # Stratifier uses a cached _y attribute if available - stratifier = self._y if is_classifier(self.estimator_) and self.stratify else None - - return [ - (indices_train, indices_test) - for indices_train, indices_test in self._get_estimators_indices(stratifier=stratifier) - ] - - def _statistic( - self, - estimator: ForestClassifier, - X: ArrayLike, - y: ArrayLike, - covariate_index: ArrayLike, - metric: str, - return_posteriors: bool, - **metric_kwargs, - ): - raise NotImplementedError("Subclasses should implement this!") - - def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: Optional[ArrayLike] = None): - X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True, dtype=DTYPE) - if y.ndim != 2: - y = y.reshape(-1, 1) - - if covariate_index is not None: - if not isinstance(covariate_index, (list, tuple, np.ndarray)): - raise RuntimeError("covariate_index must be an iterable of integer indices") - else: - if not all(isinstance(idx, (np.integer, int)) for idx in covariate_index): - raise RuntimeError("Not all covariate_index are integer indices") - - if self.test_size * X.shape[0] < 5: - raise RuntimeError( - f"There are less than 5 testing samples used with " - f"test_size={self.test_size} for X ({X.shape})." - ) - - if self._n_samples_ is not None and X.shape[0] != self._n_samples_: - raise RuntimeError( - f"X must have {self._n_samples_} samples, got {X.shape[0]}. " - f"If running on a new dataset, call the 'reset' method." - ) - if self.n_features_in_ is not None and X.shape[1] != self.n_features_in_: - raise RuntimeError( - f"X must have {self.n_features_in_} features, got {X.shape[1]}. " - f"If running on a new dataset, call the 'reset' method." - ) - if self._type_of_target_ is not None and type_of_target(y) != self._type_of_target_: - raise RuntimeError( - f"y must have type {self._type_of_target_}, got {type_of_target(y)}. " - f"If running on a new dataset, call the 'reset' method." - ) - - if not self.train_test_split and not isinstance(self.estimator, HonestForestClassifier): - raise RuntimeError("Train test split must occur if not using honest forest classifier.") - - if self.permute_forest_fraction is not None and self.permute_forest_fraction < 0.0: - raise RuntimeError("permute_forest_fraction must be non-negative.") - - if ( - self.permute_forest_fraction is not None - and self.permute_forest_fraction * self.n_estimators < 1.0 - ): - raise RuntimeError( - "permute_forest_fraction must be greater than 1./n_estimators, " - f"got {self.permute_forest_fraction}." - ) - - return X, y, covariate_index - - def statistic( - self, - X: ArrayLike, - y: ArrayLike, - covariate_index: Optional[ArrayLike] = None, - metric="mi", - return_posteriors: bool = False, - check_input: bool = True, - **metric_kwargs, - ) -> Tuple[float, ArrayLike, ArrayLike]: - """Compute the test statistic. - - Parameters - ---------- - X : ArrayLike of shape (n_samples, n_features) - The data matrix. - y : ArrayLike of shape (n_samples, n_outputs) - The target matrix. - covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, by default None. - metric : str, optional - The metric to compute, by default "mse". 
- return_posteriors : bool, optional - Whether or not to return the posteriors, by default False. - check_input : bool, optional - Whether or not to check the input, by default True. - **metric_kwargs : dict, optional - Additional keyword arguments to pass to the metric function. - - Returns - ------- - stat : float - The test statistic. - posterior_final : ArrayLike of shape (n_estimators, n_samples_final, n_outputs) or - (n_estimators, n_samples_final), optional - If ``return_posteriors`` is True, then the posterior probabilities of the - samples used in the final test. ``n_samples_final`` is equal to ``n_samples`` - if all samples are encountered in the test set of at least one tree in the - posterior computation. - samples : ArrayLike of shape (n_samples_final,), optional - The indices of the samples used in the final test. ``n_samples_final`` is - equal to ``n_samples`` if all samples are encountered in the test set of at - least one tree in the posterior computation. - """ - if check_input: - X, y, covariate_index = self._check_input(X, y, covariate_index) - - if self._n_samples_ is None: - self._n_samples_, self.n_features_in_ = X.shape - - # Infer type of target y - if self._type_of_target_ is None: - self._type_of_target_ = type_of_target(y) - - if covariate_index is None: - self.estimator_ = self._get_estimator() - estimator = self.estimator_ - else: - self.permuted_estimator_ = self._get_estimator() - estimator = self.permuted_estimator_ - - if not hasattr(self, "estimator_"): - self.estimator_ = self._get_estimator() - - # XXX: this can be improved as an extra fit can be avoided, by - # just doing error-checking - # and then setting the internal meta data structures - # first run a dummy fit on the samples to initialize the - # internal data structure of the forest - if is_classifier(self.estimator_): - _unique_y = [] - for axis in range(y.shape[1]): - _unique_y.append(np.unique(y[:, axis])) - unique_y = np.hstack(_unique_y) - if unique_y.ndim > 1 and unique_y.shape[1] == 1: - unique_y = unique_y.ravel() - X_dummy = np.zeros((unique_y.shape[0], X.shape[1])) - self.estimator_.fit(X_dummy, unique_y) - else: - if y.ndim > 1 and y.shape[1] == 1: - self.estimator_.fit(X[:2], y[:2].ravel()) - else: - self.estimator_.fit(X[:2], y[:2]) - - # Store a cache of the y variable - if is_classifier(estimator): - self._y = y.copy() - - # # XXX: this can be improved as an extra fit can be avoided, by just doing error-checking - # # and then setting the internal meta data structures - # # first run a dummy fit on the samples to initialize the - # # internal data structure of the forest - if not _is_fitted(estimator) and is_classifier(estimator): - _unique_y = [] - for axis in range(y.shape[1]): - _unique_y.append(np.unique(y[:, axis])) - unique_y = np.hstack(_unique_y) - if unique_y.ndim > 1 and unique_y.shape[1] == 1: - unique_y = unique_y.ravel() - X_dummy = np.zeros((unique_y.shape[0], X.shape[1])) - estimator.fit(X_dummy, unique_y) - elif not _is_fitted(estimator): - if y.ndim > 1 and y.shape[1] == 1: - estimator.fit(X[:2], y[:2].ravel()) - else: - estimator.fit(X[:2], y[:2]) - - # sampling a separate train/test per tree - if self.sample_dataset_per_tree: - self.n_samples_test_ = self._n_samples_ - else: - # here we fix a training/testing dataset - test_size_ = int(self.test_size * self._n_samples_) - - # Fit each tree and compute posteriors with train test splits - self.n_samples_test_ = test_size_ - - if self._metric is not None and self._metric != metric: - raise RuntimeError( - f"Metric 
must be {self._metric}, got {metric}. " - f"If running on a new dataset, call the 'reset' method." - ) - self._metric = metric - - if not is_classifier(estimator) and metric not in REGRESSOR_METRICS: - raise RuntimeError( - f'Metric must be either "mse" or "mae" if using Regression, got {metric}' - ) - - if estimator.n_outputs_ > 1 and metric == "auc": - raise ValueError("AUC metric is not supported for multi-output") - - return self._statistic( - estimator, - X, - y, - covariate_index=covariate_index, - metric=metric, - return_posteriors=return_posteriors, - **metric_kwargs, - ) - - def test( - self, - X, - y, - covariate_index: Optional[ArrayLike] = None, - metric: str = "mi", - n_repeats: int = 1000, - return_posteriors: bool = True, - **metric_kwargs, - ): - """Perform hypothesis test using Coleman method. - - X is split into a training/testing split. Optionally, the covariate index - columns are shuffled. - - On the training dataset, two honest forests are trained and then the posterior - is estimated on the testing dataset. One honest forest is trained on the - permuted dataset and the other is trained on the original dataset. - - Finally, resample the posteriors of the two forests to compute the null - distribution of the statistics. - - Parameters - ---------- - X : ArrayLike of shape (n_samples, n_features) - The data matrix. - y : ArrayLike of shape (n_samples, n_outputs) - The target matrix. - covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, will shuffle all columns by - default (corresponding to None). - metric : str, optional - The metric to compute, by default "mse". - n_repeats : int, optional - Number of times to sample the null distribution, by default 1000. - return_posteriors : bool, optional - Whether or not to return the posteriors, by default True. - **metric_kwargs : dict, optional - Additional keyword arguments to pass to the metric function. - - Returns - ------- - stat : float - The test statistic. To compute the test statistic, take - ``permute_stat_`` and subtract ``observe_stat_``. - pval : float - The p-value of the test statistic. 
- """ - X, y, covariate_index = self._check_input(X, y, covariate_index) - - if not self._is_fitted: - # first compute the test statistic on the un-permuted data - observe_stat, observe_posteriors, observe_samples = self.statistic( - X, - y, - covariate_index=None, - metric=metric, - return_posteriors=return_posteriors, - check_input=False, - **metric_kwargs, - ) - else: - observe_samples = self.observe_samples_ - observe_posteriors = self.observe_posteriors_ - observe_stat = self.observe_stat_ - - if covariate_index is None: - covariate_index = np.arange(X.shape[1], dtype=int) - - # next permute the data - permute_stat, permute_posteriors, permute_samples = self.statistic( - X, - y, - covariate_index=covariate_index, - metric=metric, - return_posteriors=return_posteriors, - check_input=False, - **metric_kwargs, - ) - self.permute_stat_ = permute_stat - - # Note: at this point, both `estimator` and `permuted_estimator_` should - # have been fitted already, so we can now compute on the null by resampling - # the posteriors and computing the test statistic on the resampled posteriors - if self.sample_dataset_per_tree: - metric_star, metric_star_pi = _compute_null_distribution_coleman( - y_test=y, - y_pred_proba_normal=observe_posteriors, - y_pred_proba_perm=permute_posteriors, - metric=metric, - n_repeats=n_repeats, - seed=self.random_state, - **metric_kwargs, - ) - else: - # If not sampling a new dataset per tree, then we may either be - # permuting the covariate index per tree or per forest. If not permuting - # there is only one train and test split, so we can just use that - _, indices_test = self.train_test_samples_[0] - indices_test = observe_samples - y_test = y[indices_test, :] - y_pred_proba_normal = observe_posteriors[:, indices_test, :] - y_pred_proba_perm = permute_posteriors[:, indices_test, :] - - metric_star, metric_star_pi = _compute_null_distribution_coleman( - y_test=y_test, - y_pred_proba_normal=y_pred_proba_normal, - y_pred_proba_perm=y_pred_proba_perm, - metric=metric, - n_repeats=n_repeats, - seed=self.random_state, - **metric_kwargs, - ) - # metric^\pi - metric = observed test statistic, which under the - # null is normally distributed around 0 - observe_test_stat = permute_stat - observe_stat - - # metric^\pi_j - metric_j, which is centered at 0 - null_dist = metric_star_pi - metric_star - - # compute pvalue - if metric in POSITIVE_METRICS: - pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats) - else: - pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats) - - if return_posteriors: - self.observe_posteriors_ = observe_posteriors - self.permute_posteriors_ = permute_posteriors - self.observe_samples_ = observe_samples - self.permute_samples_ = permute_samples - - self.null_dist_ = null_dist - return observe_test_stat, pvalue - - -class FeatureImportanceForestRegressor(BaseForestHT): - """Forest hypothesis testing with continuous `y` variable. - - Implements the algorithm described in :footcite:`coleman2022scalable`. - - The dataset is split into a training and testing dataset initially. Then there - are two forests that are trained: one on the original dataset, and one on the - permuted dataset. The dataset is either permuted once, or independently for - each tree in the permuted forest. The original test statistic is computed by - comparing the metric on both forests ``(metric_forest - metric_perm_forest)``. - - Then the output predictions are randomly sampled to recompute the test statistic - ``n_repeats`` times. 
The p-value is computed as the proportion of times the - null test statistic is greater than the original test statistic. - - Parameters - ---------- - estimator : object, default=None - Type of forest estimator to use. By default `None`, which defaults to - :class:`sklearn.ensemble.RandomForestRegressor`. - - random_state : int, RandomState instance or None, default=None - Controls both the randomness of the bootstrapping of the samples used - when building trees (if ``bootstrap=True``) and the sampling of the - features to consider when looking for the best split at each node - (if ``max_features < n_features``). - See :term:`Glossary ` for details. - - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - - test_size : float, default=0.2 - Proportion of samples per tree to use for the test set. - - sample_dataset_per_tree : bool, default=False - Whether to sample the dataset per tree or per forest. - - conditional_perm : bool, default=False - Whether or not to conditionally permute the covariate index. If True, - then the covariate index is permuted while preserving the joint with respect - to the rest of the covariates. - - permute_forest_fraction : float, default=None - The fraction of trees to permute the covariate index for. If None, then - just one permutation is performed. If sampling a permutation per tree - is desirable, then the fraction should be set to ``1. / n_estimators``. - - train_test_split : bool, default=True - Whether to split the dataset before passing to the forest. - - Attributes - ---------- - estimator_ : BaseForest - The estimator used to compute the test statistic. - - n_samples_test_ : int - The number of samples used in the final test set. - - indices_train_ : ArrayLike of shape (n_samples_train,) - The indices of the samples used in the training set. - - indices_test_ : ArrayLike of shape (n_samples_test,) - The indices of the samples used in the testing set. - - samples_ : ArrayLike of shape (n_samples_final,) - The indices of the samples used in the final test set that would slice - the original ``(X, y)`` input. - - y_true_final_ : ArrayLike of shape (n_samples_final,) - The true labels of the samples used in the final test. - - observe_posteriors_ : ArrayLike of shape (n_estimators, n_samples, n_outputs) or - (n_estimators, n_samples, n_classes) - The predicted posterior probabilities of the samples used in the final test. - For samples that are NaNs for all estimators, means the sample was not used - in the test set at all across all trees. - - null_dist_ : ArrayLike of shape (n_repeats,) - The null distribution of the test statistic. - - Notes - ----- - This class trains two forests: one on the original dataset, and one on the - permuted dataset. The forest from the original dataset is cached and re-used to - compute the test-statistic each time the :meth:`test` method is called. However, - the forest from the permuted dataset is re-trained each time the :meth:`test` is called - if the ``covariate_index`` differs from the previous run. - - To fully start from a new dataset, call the ``reset`` method, which will then - re-train both forests upon calling the :meth:`test` and :meth:`statistic` methods. - - References - ---------- - .. 
footbibliography:: - """ - - def __init__( - self, - estimator=None, - random_state=None, - verbose=0, - test_size=0.2, - sample_dataset_per_tree=False, - conditional_perm=False, - permute_forest_fraction=None, - train_test_split=True, - ): - super().__init__( - estimator=estimator, - random_state=random_state, - verbose=verbose, - test_size=test_size, - sample_dataset_per_tree=sample_dataset_per_tree, - conditional_perm=conditional_perm, - permute_forest_fraction=permute_forest_fraction, - train_test_split=train_test_split, - ) - - def _get_estimator(self): - if self.estimator is None: - estimator_ = RandomForestRegressor() - elif not isinstance(self.estimator, (ForestRegressor, sklearnForestRegressor)): - raise RuntimeError(f"Estimator must be a ForestRegressor, got {type(self.estimator)}") - else: - estimator_ = self.estimator - return clone(estimator_) - - def statistic( - self, - X: ArrayLike, - y: ArrayLike, - covariate_index: Optional[ArrayLike] = None, - metric="mse", - return_posteriors: bool = False, - check_input: bool = True, - **metric_kwargs, - ): - """Compute the test statistic. - - Parameters - ---------- - X : ArrayLike of shape (n_samples, n_features) - The data matrix. - y : ArrayLike of shape (n_samples, n_outputs) - The target matrix. - covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, by default None. - metric : str, optional - The metric to compute, by default "mse". - return_posteriors : bool, optional - Whether or not to return the posteriors, by default False. - check_input : bool, optional - Whether or not to check the input, by default True. - **metric_kwargs : dict, optional - Additional keyword arguments to pass to the metric function. - - Returns - ------- - stat : float - The test statistic. - posterior_final : ArrayLike of shape (n_estimators, n_samples_final, n_outputs) or - (n_estimators, n_samples_final), optional - If ``return_posteriors`` is True, then the posterior probabilities of the - samples used in the final test. ``n_samples_final`` is equal to ``n_samples`` - if all samples are encountered in the test set of at least one tree in the - posterior computation. - samples : ArrayLike of shape (n_samples_final,), optional - The indices of the samples used in the final test. ``n_samples_final`` is - equal to ``n_samples`` if all samples are encountered in the test set of at - least one tree in the posterior computation. 
- """ - return super().statistic( - X, y, covariate_index, metric, return_posteriors, check_input, **metric_kwargs - ) - - def _statistic( - self, - estimator: ForestClassifier, - X: ArrayLike, - y: ArrayLike, - covariate_index: ArrayLike, - metric: str, - return_posteriors: bool, - **metric_kwargs, - ): - """Helper function to compute the test statistic.""" - metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric] - rng = np.random.default_rng(self.random_state) - - posterior_arr = np.full((self.n_estimators, self._n_samples_, estimator.n_outputs_), np.nan) - - # both sampling dataset per tree or permuting per tree requires us to bypass the - # sklearn API to fit each tree individually - if self.sample_dataset_per_tree or self.permute_forest_fraction: - if self.permute_forest_fraction and covariate_index is not None: - random_states = [tree.random_state for tree in estimator.estimators_] - else: - random_states = [estimator.random_state] * len(estimator.estimators_) - - trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")( - delayed(_parallel_build_trees_with_sepdata)( - estimator.estimators_[idx], - len(estimator.estimators_), - idx, - indices_train, - X, - y, - covariate_index, - bootstrap=estimator.bootstrap, - max_samples=estimator.max_samples, - random_state=random_states[idx], - ) - for idx, (indices_train, _) in enumerate(self.train_test_samples_) - ) - estimator.estimators_ = trees - else: - # fitting a forest will only get one unique train/test split - indices_train, indices_test = self.train_test_samples_[0] - - X_train, _ = X[indices_train, :], X[indices_test, :] - y_train, _ = y[indices_train, :], y[indices_test, :] - - if covariate_index is not None: - # perform permutation of covariates - if self.conditional_perm: - X_perm_cov_ind = conditional_resample( - X_train, X_train[:, covariate_index], replace=False, random_state=rng - ) - X_train[:, covariate_index] = X_perm_cov_ind - else: - n_samples_train = X_train.shape[0] - index_arr = rng.choice( - np.arange(n_samples_train, dtype=int), - size=(n_samples_train, 1), - replace=False, - shuffle=True, - ) - X_train[:, covariate_index] = X_train[index_arr, covariate_index] - - if self._type_of_target_ == "binary": - y_train = y_train.ravel() - estimator.fit(X_train, y_train) - - # TODO: probably a more elegant way of doing this - if self.train_test_split: - # accumulate the predictions across all trees - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)( - estimator.estimators_[idx].predict, X, indices_test - ) - for idx, (_, indices_test) in enumerate(self.train_test_samples_) - ) - for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)): - _, indices_test = est_indices - posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_) - else: - all_indices = np.arange(self._n_samples_, dtype=int) - - # accumulate the predictions across all trees - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, all_indices) - for idx in range(len(estimator.estimators_)) - ) - for itree, proba in enumerate(all_proba): - posterior_arr[itree, ...] 
= proba.reshape(-1, estimator.n_outputs_) - - # determine if there are any nans in the final posterior array, when - # averaged over the trees - samples = _non_nan_samples(posterior_arr) - - # Ignore all NaN values (samples not tested) - y_true_final = y[(samples), :] - - # Average all posteriors (n_samples_test, n_outputs) to compute the statistic - posterior_forest = np.nanmean(posterior_arr[:, (samples), :], axis=0) - stat = metric_func(y_true_final, posterior_forest, **metric_kwargs) - if covariate_index is None: - # Ignore all NaN values (samples not tested) -> (n_samples_final, n_outputs) - # arrays of y and predicted posterior - self.observe_samples_ = samples - self.y_true_final_ = y_true_final - self.observe_posteriors_ = posterior_arr - self.observe_stat_ = stat - self._is_fitted = True - - if return_posteriors: - return stat, posterior_arr, samples - - return stat - - -class FeatureImportanceForestClassifier(BaseForestHT): - """Forest hypothesis testing with categorical `y` variable. - - Implements the algorithm described in :footcite:`coleman2022scalable`. - - The dataset is split into a training and testing dataset initially. Then there - are two forests that are trained: one on the original dataset, and one on the - permuted dataset. The dataset is either permuted once, or independently for - each tree in the permuted forest. The original test statistic is computed by - comparing the metric on both forests ``(metric_forest - metric_perm_forest)``. - - Then the output predictions are randomly sampled to recompute the test statistic - ``n_repeats`` times. The p-value is computed as the proportion of times the - null test statistic is greater than the original test statistic. - - Parameters - ---------- - estimator : object, default=None - Type of forest estimator to use. By default `None`, which defaults to - :class:`sklearn.ensemble.RandomForestRegressor`. - - random_state : int, RandomState instance or None, default=None - Controls both the randomness of the bootstrapping of the samples used - when building trees (if ``bootstrap=True``) and the sampling of the - features to consider when looking for the best split at each node - (if ``max_features < n_features``). - See :term:`Glossary ` for details. - - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - - test_size : float, default=0.2 - Proportion of samples per tree to use for the test set. - - stratify : bool, default=True - Whether to stratify the samples by class labels. - - conditional_perm : bool, default=False - Whether or not to conditionally permute the covariate index. If True, - then the covariate index is permuted while preserving the joint with respect - to the rest of the covariates. - - sample_dataset_per_tree : bool, default=False - Whether to sample the dataset per tree or per forest. - - permute_forest_fraction : float, default=None - The fraction of trees to permute the covariate index for. If None, then - just one permutation is performed. - - train_test_split : bool, default=True - Whether to split the data into train/test before passing to the forest. - - Attributes - ---------- - estimator_ : BaseForest - The estimator used to compute the test statistic. - - n_samples_test_ : int - The number of samples used in the final test set. - - indices_train_ : ArrayLike of shape (n_samples_train,) - The indices of the samples used in the training set. - - indices_test_ : ArrayLike of shape (n_samples_test,) - The indices of the samples used in the testing set. 
- - samples_ : ArrayLike of shape (n_samples_final,) - The indices of the samples used in the final test set that would slice - the original ``(X, y)`` input along the rows. - - y_true_final_ : ArrayLike of shape (n_samples_final,) - The true labels of the samples used in the final test. - - observe_posteriors_ : ArrayLike of shape (n_estimators, n_samples_final, n_outputs) or - (n_estimators, n_samples_final, n_classes) - The predicted posterior probabilities of the samples used in the final test. - - null_dist_ : ArrayLike of shape (n_repeats,) - The null distribution of the test statistic. - - Notes - ----- - This class trains two forests: one on the original dataset, and one on the - permuted dataset. The forest from the original dataset is cached and re-used to - compute the test-statistic each time the :meth:`test` method is called. However, - the forest from the permuted dataset is re-trained each time the :meth:`test` is called - if the ``covariate_index`` differs from the previous run. - - To fully start from a new dataset, call the ``reset`` method, which will then - re-train both forests upon calling the :meth:`test` and :meth:`statistic` methods. - - References - ---------- - .. footbibliography:: - """ - - def __init__( - self, - estimator=None, - random_state=None, - verbose=0, - test_size=0.2, - stratify=True, - conditional_perm=False, - sample_dataset_per_tree=False, - permute_forest_fraction=None, - train_test_split=True, - ): - super().__init__( - estimator=estimator, - random_state=random_state, - verbose=verbose, - test_size=test_size, - sample_dataset_per_tree=sample_dataset_per_tree, - stratify=stratify, - conditional_perm=conditional_perm, - train_test_split=train_test_split, - permute_forest_fraction=permute_forest_fraction, - ) - - def _get_estimator(self): - if self.estimator is None: - estimator_ = RandomForestClassifier() - elif not isinstance(self.estimator, (ForestClassifier, sklearnForestClassifier)): - raise RuntimeError(f"Estimator must be a ForestClassifier, got {type(self.estimator)}") - else: - # self.estimator is an instance of a ForestEstimator - estimator_ = self.estimator - return clone(estimator_) - - def _statistic( - self, - estimator: ForestClassifier, - X: ArrayLike, - y: ArrayLike, - covariate_index: ArrayLike, - metric: str, - return_posteriors: bool, - **metric_kwargs, - ): - """Helper function to compute the test statistic.""" - metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric] - rng = np.random.default_rng(estimator.random_state) - - if metric in POSTERIOR_FUNCTIONS: - predict_posteriors = True - else: - predict_posteriors = False - - if predict_posteriors: - # now initialize posterior array as (n_trees, n_samples_test, n_classes) - # XXX: currently assumes n_outputs_ == 1 - posterior_arr = np.full( - (self.n_estimators, self._n_samples_, estimator.n_classes_), np.nan - ) - else: - # now initialize posterior array as (n_trees, n_samples_test, n_outputs) - posterior_arr = np.full( - (self.n_estimators, self._n_samples_, estimator.n_outputs_), np.nan - ) - - # both sampling dataset per tree or permuting per tree requires us to bypass the - # sklearn API to fit each tree individually - if self.sample_dataset_per_tree or self.permute_forest_fraction: - if self.permute_forest_fraction and covariate_index is not None: - random_states = [tree.random_state for tree in estimator.estimators_] - else: - random_states = [estimator.random_state] * len(estimator.estimators_) - - trees = Parallel(n_jobs=estimator.n_jobs, 
verbose=self.verbose, prefer="threads")( - delayed(_parallel_build_trees_with_sepdata)( - estimator.estimators_[idx], - len(estimator.estimators_), - idx, - indices_train, - X, - y, - covariate_index, - bootstrap=estimator.bootstrap, - max_samples=estimator.max_samples, - random_state=random_states[idx], - ) - for idx, (indices_train, _) in enumerate(self.train_test_samples_) - ) - estimator.estimators_ = trees - else: - # fitting a forest will only get one unique train/test split - indices_train, indices_test = self.train_test_samples_[0] - - X_train, _ = X[indices_train, :], X[indices_test, :] - y_train = y[indices_train, :] - - if covariate_index is not None: - # perform permutation of covariates - if self.conditional_perm: - X_perm_cov_ind = conditional_resample( - X_train, X_train[:, covariate_index], replace=False, random_state=rng - ) - X_train[:, covariate_index] = X_perm_cov_ind - else: - n_samples_train = X_train.shape[0] - index_arr = rng.choice( - np.arange(n_samples_train, dtype=int), - size=(n_samples_train, 1), - replace=False, - shuffle=True, - ) - X_train[:, covariate_index] = X_train[index_arr, covariate_index] - - if self._type_of_target_ == "binary" or (y.ndim > 1 and y.shape[1] == 1): - y_train = y_train.ravel() - estimator.fit(X_train, y_train) - - # list of tree outputs. Each tree output is (n_samples, n_outputs), or (n_samples,) - if predict_posteriors: - # all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - # delayed(_parallel_predict_proba)( - # estimator.estimators_[idx].predict_proba, X, indices_test - # ) - # for idx, (_, indices_test) in enumerate(self.train_test_samples_) - # ) - - # TODO: probably a more elegant way of doing this - if self.train_test_split: - # accumulate the predictions across all trees - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)( - estimator.estimators_[idx].predict_proba, X, indices_test - ) - for idx, (_, indices_test) in enumerate(self.train_test_samples_) - ) - else: - all_indices = np.arange(self._n_samples_, dtype=int) - - # accumulate the predictions across all trees - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)( - estimator.estimators_[idx].predict_proba, X, all_indices - ) - for idx in range(len(estimator.estimators_)) - ) - else: - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)( - estimator.estimators_[idx].predict, X, indices_test - ) - for idx, (_, indices_test) in enumerate(self.train_test_samples_) - ) - for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)): - _, indices_test = est_indices - - if predict_posteriors: - if self.train_test_split: - posterior_arr[itree, indices_test, ...] = proba.reshape( - -1, estimator.n_classes_ - ) - else: - posterior_arr[itree, ...] = proba.reshape(-1, estimator.n_classes_) - else: - posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_) - - if metric == "auc": - # at this point, posterior_final is the predicted posterior for only the positive class - # as more than one output is not supported. - if self._type_of_target_ == "binary": - posterior_arr = posterior_arr[..., (1,)] - else: - raise RuntimeError( - f"AUC metric is not supported for {self._type_of_target_} targets." 
- ) - - # determine if there are any nans in the final posterior array, when - # averaged over the trees - samples = _non_nan_samples(posterior_arr) - - # Ignore all NaN values (samples not tested) - y_true_final = y[(samples), :] - - # Average all posteriors (n_samples_test, n_outputs) to compute the statistic - posterior_forest = np.nanmean(posterior_arr[:, (samples), :], axis=0) - stat = metric_func(y_true_final, posterior_forest, **metric_kwargs) - - if covariate_index is None: - # Ignore all NaN values (samples not tested) -> (n_samples_final, n_outputs) - # arrays of y and predicted posterior - self.observe_samples_ = samples - self.y_true_final_ = y_true_final - self.observe_posteriors_ = posterior_arr - self.observe_stat_ = stat - self._is_fitted = True - - if return_posteriors: - return stat, posterior_arr, samples - - return stat - - def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock): """ This is a utility function for joblib's Parallel. @@ -1253,7 +83,8 @@ def build_coleman_forest( y : ArrayLike of shape (n_samples, n_outputs) Binary target, so ``n_outputs`` should be at most 1. covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, by default None. + The index array of covariates to shuffle, by default None, which + defaults to all covariates. metric : str, optional The metric to compute, by default "s@98", for sensitivity at 98% specificity. @@ -1300,7 +131,9 @@ def build_coleman_forest( raise RuntimeError( f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}" ) - perm_est, perm_forest_proba = build_oob_forest(perm_est, X, y, verbose=verbose) + perm_est, perm_forest_proba = build_oob_forest( + perm_est, X, y, verbose=verbose, covariate_index=covariate_index + ) # get the number of jobs n_jobs = est.n_jobs diff --git a/sktree/stats/meson.build b/sktree/stats/meson.build index b99353c80..cd5472386 100644 --- a/sktree/stats/meson.build +++ b/sktree/stats/meson.build @@ -2,7 +2,6 @@ python_sources = [ '__init__.py', 'forestht.py', 'utils.py', - 'permutationforest.py', 'monte_carlo.py', 'permuteforest.py', ] diff --git a/sktree/stats/permutationforest.py b/sktree/stats/permutationforest.py deleted file mode 100644 index 075874b12..000000000 --- a/sktree/stats/permutationforest.py +++ /dev/null @@ -1,439 +0,0 @@ -from typing import Optional - -import numpy as np -from numpy.typing import ArrayLike -from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.ensemble._forest import ForestClassifier as sklearnForestClassifier -from sklearn.ensemble._forest import ForestRegressor as sklearnForestRegressor -from sklearn.model_selection import train_test_split -from sklearn.utils.validation import check_X_y - -from sktree._lib.sklearn.ensemble._forest import ForestClassifier, ForestRegressor - -from .utils import METRIC_FUNCTIONS, REGRESSOR_METRICS, _compute_null_distribution_perm - - -class BasePermutationForest(MetaEstimatorMixin): - def __init__( - self, - estimator=None, - test_size=0.2, - random_state=None, - verbose=0, - ): - self.estimator = estimator - self.test_size = test_size - self.random_state = random_state - self.verbose = verbose - - def reset(self): - class_attributes = dir(type(self)) - instance_attributes = dir(self) - - for attr_name in instance_attributes: - if attr_name.endswith("_") and attr_name not in class_attributes: - delattr(self, attr_name) 
- - def _get_estimator(self): - pass - - @property - def train_test_samples_(self): - """ - The subset of drawn samples for each base estimator. - - Returns a dynamically generated list of indices identifying - the samples used for fitting each member of the ensemble, i.e., - the in-bag samples. - - Note: the list is re-created at each call to the property in order - to reduce the object memory footprint by not storing the sampling - data. Thus fetching the property may be slower than expected. - """ - indices = np.arange(self._n_samples_, dtype=int) - - # Get drawn indices along both sample and feature axes - indices_train, indices_test = train_test_split( - indices, test_size=self.test_size, shuffle=True, random_state=self.random_state - ) - return indices_train, indices_test - - def _statistic( - self, - estimator: BaseEstimator, - X: ArrayLike, - y: ArrayLike, - covariate_index: Optional[ArrayLike] = None, - metric="mse", - return_posteriors: bool = False, - seed=None, - **metric_kwargs, - ): - """Helper function to compute the test statistic.""" - metric_func = METRIC_FUNCTIONS[metric] - if seed is None: - rng = np.random.default_rng(self.random_state) - else: - rng = np.random.default_rng(seed) - indices_train, indices_test = self.train_test_samples_ - if covariate_index is not None: - n_samples = X.shape[0] - indices = np.arange(n_samples, dtype=int) - # perform permutation of covariates - index_arr = rng.choice(indices, size=(n_samples, 1), replace=False, shuffle=False) - X = X.copy() - X[:, covariate_index] = X[index_arr, covariate_index] - - X_train, X_test = X[indices_train, :], X[indices_test, :] - y_train, y_test = y[indices_train, :], y[indices_test, :] - if y_train.shape[1] == 1: - y_train = y_train.ravel() - y_test = y_test.ravel() - estimator.fit(X_train, y_train) - - # Either get the predicted value, or the posterior probabilities - y_pred = estimator.predict(X_test) - - # set variables to compute metric - samples = indices_test - y_true_final = y_test - posterior_final = y_pred - - stat = metric_func(y_true_final, posterior_final, **metric_kwargs) - - if covariate_index is None: - # Ignore all NaN values (samples not tested) -> (n_samples_final, n_outputs) - # arrays of y and predicted posterior - self.samples_ = samples - self.y_true_ = y_true_final - self.posterior_final_ = posterior_final - self.stat_ = stat - - if return_posteriors: - return stat, posterior_final, samples - - return stat - - def statistic( - self, - X: ArrayLike, - y: ArrayLike, - covariate_index: Optional[ArrayLike] = None, - metric="mse", - return_posteriors: bool = False, - check_input: bool = True, - seed=None, - **metric_kwargs, - ): - """Compute the test statistic. - - Parameters - ---------- - X : ArrayLike of shape (n_samples, n_features) - The data matrix. - y : ArrayLike of shape (n_samples, n_outputs) - The target matrix. - covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, by default None. - metric : str, optional - The metric to compute, by default "mse". - return_posteriors : bool, optional - Whether or not to return the posteriors, by default False. - check_input : bool, optional - Whether or not to check the input, by default True. - seed : int, optional - The random seed to use, by default None. - **metric_kwargs : dict, optional - Keyword arguments to pass to the metric function. - - Returns - ------- - stat : float - The test statistic. 
- posterior_final : ArrayLike of shape (n_samples_final, n_outputs), optional - If ``return_posteriors`` is True, then the posterior probabilities of the - samples used in the final test. ``n_samples_final`` is equal to ``n_samples`` - if all samples are encountered in the test set of at least one tree in the - posterior computation. - samples : ArrayLike of shape (n_samples_final,), optional - The indices of the samples used in the final test. ``n_samples_final`` is - equal to ``n_samples`` if all samples are encountered in the test set of at - least one tree in the posterior computation. - """ - if check_input: - X, y = check_X_y(X, y, ensure_2d=True, multi_output=True) - if y.ndim != 2: - y = y.reshape(-1, 1) - - self._n_samples_ = X.shape[0] - self.estimator_ = self._get_estimator() - - if is_classifier(self.estimator_): - if metric not in METRIC_FUNCTIONS: - raise RuntimeError( - f"Metric must be one of {list(METRIC_FUNCTIONS.keys())}, got {metric}" - ) - else: - if metric not in REGRESSOR_METRICS: - raise RuntimeError(f'Metric must be either "mse" or "mae", got {metric}') - - if covariate_index is None: - estimator = self.estimator_ - else: - self.permuted_estimator_ = clone(self.estimator_) - estimator = self.permuted_estimator_ - - return self._statistic( - estimator, - X, - y, - covariate_index=covariate_index, - metric=metric, - return_posteriors=return_posteriors, - seed=seed, - **metric_kwargs, - ) - - def test( - self, - X: ArrayLike, - y: ArrayLike, - covariate_index: ArrayLike, - metric: str = "mse", - n_repeats: int = 1000, - return_posteriors: bool = False, - **metric_kwargs, - ): - """Perform hypothesis test using permutation testing. - - Parameters - ---------- - X : ArrayLike of shape (n_samples, n_features) - The data matrix. - y : ArrayLike of shape (n_samples, n_outputs) - The target matrix. - covariate_index : ArrayLike of shape (n_covariates,) - The covariate indices of ``X`` to shuffle. - metric : str, optional - Metric to compute, by default "mse". - n_repeats : int, optional - Number of times to sample the null distribution, by default 1000. - return_posteriors : bool, optional - Whether or not to return the posteriors, by default False. - **metric_kwargs : dict, optional - Keyword arguments to pass to the metric function. - - Returns - ------- - observe_stat : float - Observed test statistic. - pvalue : float - Pvalue of the test. 
- """ - X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True) - if y.ndim != 2: - y = y.reshape(-1, 1) - self._n_samples_ = X.shape[0] - - # train/test split - # XXX: could add stratifying by y when y is classification - indices_train, indices_test = self.train_test_samples_ - - if not hasattr(self, "samples_"): - # first compute the test statistic on the un-permuted data - observe_stat, _, _ = self.statistic( - X, - y, - covariate_index=None, - metric=metric, - return_posteriors=True, - check_input=False, - **metric_kwargs, - ) - else: - # observe_samples = self.samples_ - # observe_posteriors = self.posterior_final_ - observe_stat = self.stat_ - - # compute null distribution of the test statistic - # WARNING: this could take a long time, since it fits a new forest - null_dist = _compute_null_distribution_perm( - X_train=X[indices_train, :], - y_train=y[indices_train, :], - X_test=X[indices_test, :], - y_test=y[indices_test, :], - covariate_index=covariate_index, - est=self.estimator_, - metric=metric, - n_repeats=n_repeats, - seed=self.random_state, - ) - - if not return_posteriors: - self.null_dist_ = np.array(null_dist) - else: - self.null_dist_ = np.array([x[0] for x in null_dist]) - self.posterior_null_ = np.array([x[1] for x in null_dist]) - - n_repeats = len(self.null_dist_) - pvalue = (1 + (self.null_dist_ < observe_stat).sum()) / (1 + n_repeats) - return observe_stat, pvalue - - -class PermutationForestRegressor(BasePermutationForest): - """Hypothesis testing of covariates with a permutation forest regressor. - - This implements permutation testing of a null hypothesis using a random forest. - The null hypothesis is generated by permuting ``n_repeats`` times the covariate - indices and then a random forest is trained for each permuted instance. This - is compared to the original random forest that was computed on the regular - non-permuted data. - - .. warning:: Permutation testing with forests is computationally expensive. - As a result, if you are testing for the importance of feature sets, consider - using `sktree.FeatureImportanceForestRegressor` or - `sktree.FeatureImportanceForestClassifier` instead, which is - much more computationally efficient. - - .. note:: This does not allow testing on the posteriors. - - Parameters - ---------- - estimator : object, default=None - Type of forest estimator to use. By default `None`, which defaults to - :class:`sklearn.ensemble.RandomForestRegressor` with default parameters. - - test_size : float, default=0.2 - The proportion of samples to leave out for each tree to compute metric on. - - random_state : int, RandomState instance or None, default=None - Controls both the randomness of the bootstrapping of the samples used - when building trees (if ``bootstrap=True``) and the sampling of the - features to consider when looking for the best split at each node - (if ``max_features < n_features``). - See :term:`Glossary ` for details. - - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - - Attributes - ---------- - samples_ : ArrayLike of shape (n_samples,) - The indices of the samples used in the final test. - - y_true_ : ArrayLike of shape (n_samples_final,) - The true labels of the samples used in the final test. - - posterior_ : ArrayLike of shape (n_samples_final, n_outputs) - The predicted posterior probabilities of the samples used in the final test. - - null_dist_ : ArrayLike of shape (n_repeats,) - The null distribution of the test statistic. 
- - posterior_null_ : ArrayLike of shape (n_samples_final, n_outputs, n_repeats) - The posterior probabilities of the samples used in the final test for each - permutation for the null distribution. - """ - - def __init__( - self, - estimator=None, - test_size=0.2, - random_state=None, - verbose=0, - ): - super().__init__( - estimator=estimator, - test_size=test_size, - random_state=random_state, - verbose=verbose, - ) - - def _get_estimator(self): - if not hasattr(self, "estimator_") and self.estimator is None: - estimator_ = RandomForestRegressor() - elif not isinstance(self.estimator, (ForestRegressor, sklearnForestRegressor)): - raise RuntimeError(f"Estimator must be a ForestRegressor, got {type(self.estimator)}") - else: - estimator_ = self.estimator - return estimator_ - - -class PermutationForestClassifier(BasePermutationForest): - """Hypothesis testing of covariates with a permutation forest classifier. - - This implements permutation testing of a null hypothesis using a random forest. - The null hypothesis is generated by permuting ``n_repeats`` times the covariate - indices and then a random forest is trained for each permuted instance. This - is compared to the original random forest that was computed on the regular - non-permuted data. - - .. warning:: Permutation testing with forests is computationally expensive. - As a result, if you are testing for the importance of feature sets, consider - using `sktree.FeatureImportanceForestRegressor` or - `sktree.FeatureImportanceForestClassifier` instead, which is - much more computationally efficient. - - .. note:: This does not allow testing on the posteriors. - - Parameters - ---------- - estimator : object, default=None - Type of forest estimator to use. By default `None`, which defaults to - :class:`sklearn.ensemble.RandomForestClassifier`. - - test_size : float, default=0.2 - The proportion of samples to leave out for each tree to compute metric on. - - random_state : int, RandomState instance or None, default=None - Controls both the randomness of the bootstrapping of the samples used - when building trees (if ``bootstrap=True``) and the sampling of the - features to consider when looking for the best split at each node - (if ``max_features < n_features``). - See :term:`Glossary ` for details. - - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - - Attributes - ---------- - samples_ : ArrayLike of shape (n_samples,) - The indices of the samples used in the final test. - - y_true_ : ArrayLike of shape (n_samples_final,) - The true labels of the samples used in the final test. - - posterior_ : ArrayLike of shape (n_samples_final, n_outputs) - The predicted posterior probabilities of the samples used in the final test. - - null_dist_ : ArrayLike of shape (n_repeats,) - The null distribution of the test statistic. - - posterior_null_ : ArrayLike of shape (n_samples_final, n_outputs, n_repeats) - The posterior probabilities of the samples used in the final test for each - permutation for the null distribution. 
- """ - - def __init__( - self, - estimator=None, - test_size=0.2, - random_state=None, - verbose=0, - ): - super().__init__( - estimator=estimator, - test_size=test_size, - random_state=random_state, - verbose=verbose, - ) - - def _get_estimator(self): - if not hasattr(self, "estimator_") and self.estimator is None: - estimator_ = RandomForestClassifier() - elif not isinstance(self.estimator, (ForestClassifier, sklearnForestClassifier)): - raise RuntimeError(f"Estimator must be a ForestClassifier, got {type(self.estimator)}") - else: - estimator_ = self.estimator - return estimator_ diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index e7b1ac2e9..53b2ae073 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -10,67 +10,61 @@ from flaky import flaky from scipy.special import expit -from sktree import RandomForestClassifier, RandomForestRegressor -from sktree.stats import ( - FeatureImportanceForestClassifier, - FeatureImportanceForestRegressor, - PermutationForestClassifier, - PermutationForestRegressor, -) - seed = 12345 rng = np.random.default_rng(seed) +# TODO: implement tests using the new MIGHT functional framework +@pytest.mark.skip() @flaky(max_runs=3) @pytest.mark.slowtest -@pytest.mark.parametrize( - "hypotester, model_kwargs, n_samples, n_repeats, test_size", - [ - [ - PermutationForestRegressor, - { - "estimator": RandomForestRegressor( - max_features="sqrt", - n_estimators=75, - n_jobs=-1, - ), - "random_state": seed, - }, - 300, - 50, - 0.1, - ], - [ - FeatureImportanceForestRegressor, - { - "estimator": RandomForestRegressor( - max_features="sqrt", - n_estimators=125, - n_jobs=-1, - ), - }, - 300, # n_samples - 1000, # n_repeats - 0.2, # test_size - ], - # XXX: Currently does not work with permute and sample dataset per tree - [ - FeatureImportanceForestRegressor, - { - "estimator": RandomForestRegressor( - max_features="sqrt", - n_estimators=250, - n_jobs=-1, - ), - "permute_forest_fraction": 0.5, - }, - 300, # n_samples - 1000, # n_repeats - 0.2, # test_size - ], - ], -) +# @pytest.mark.parametrize( +# "hypotester, model_kwargs, n_samples, n_repeats, test_size", +# [ +# [ +# PermutationForestRegressor, +# { +# "estimator": RandomForestRegressor( +# max_features="sqrt", +# n_estimators=75, +# n_jobs=-1, +# ), +# "random_state": seed, +# }, +# 300, +# 50, +# 0.1, +# ], +# [ +# FeatureImportanceForestRegressor, +# { +# "estimator": RandomForestRegressor( +# max_features="sqrt", +# n_estimators=125, +# n_jobs=-1, +# ), +# }, +# 300, # n_samples +# 1000, # n_repeats +# 0.2, # test_size +# ], +# # XXX: Currently does not work with permute and sample dataset per tree +# [ +# FeatureImportanceForestRegressor, +# { +# "estimator": RandomForestRegressor( +# max_features="sqrt", +# n_estimators=250, +# n_jobs=-1, +# ), +# "permute_forest_fraction": 0.5, +# }, +# 300, # n_samples +# 1000, # n_repeats +# 0.2, # test_size +# ], +# ], +# ) def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size): r"""Test hypothesis testing forests using MSE from linear model simulation. 
@@ -117,59 +111,60 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) assert pvalue > 0.05, f"pvalue: {pvalue}" +@pytest.mark.skip() @flaky(max_runs=3) @pytest.mark.slowtest -@pytest.mark.parametrize( - "hypotester, model_kwargs, n_samples, n_repeats, test_size", - [ - [ - PermutationForestClassifier, - { - "estimator": RandomForestClassifier( - max_features="sqrt", - n_estimators=50, - n_jobs=-1, - ), - "random_state": seed, - }, - 600, - 50, - 1.0 / 6, - ], - [ - # XXX: Currently does not work with permute and sample dataset per tree - FeatureImportanceForestClassifier, - { - "estimator": RandomForestClassifier( - max_features="sqrt", - n_estimators=100, - n_jobs=-1, - ), - "sample_dataset_per_tree": False, - }, - 600, # n_samples - 1000, # n_repeats - 1.0 / 6, # test_size - ], - # XXX: Currently does not work with permute and sample dataset per tree - [ - FeatureImportanceForestClassifier, - { - "estimator": RandomForestClassifier( - max_features="sqrt", - n_estimators=100, - n_jobs=-1, - random_state=rng.integers(0, 1000), - ), - "permute_forest_fraction": 0.5, - "random_state": rng.integers(0, 1000), - }, - 600, # n_samples - 1000, # n_repeats - 1.0 / 6, # test_size - ], - ], -) +# @pytest.mark.parametrize( +# "hypotester, model_kwargs, n_samples, n_repeats, test_size", +# [ +# [ +# PermutationForestClassifier, +# { +# "estimator": RandomForestClassifier( +# max_features="sqrt", +# n_estimators=50, +# n_jobs=-1, +# ), +# "random_state": seed, +# }, +# 600, +# 50, +# 1.0 / 6, +# ], +# [ +# # XXX: Currently does not work with permute and sample dataset per tree +# FeatureImportanceForestClassifier, +# { +# "estimator": RandomForestClassifier( +# max_features="sqrt", +# n_estimators=100, +# n_jobs=-1, +# ), +# "sample_dataset_per_tree": False, +# }, +# 600, # n_samples +# 1000, # n_repeats +# 1.0 / 6, # test_size +# ], +# # XXX: Currently does not work with permute and sample dataset per tree +# [ +# FeatureImportanceForestClassifier, +# { +# "estimator": RandomForestClassifier( +# max_features="sqrt", +# n_estimators=100, +# n_jobs=-1, +# random_state=rng.integers(0, 1000), +# ), +# "permute_forest_fraction": 0.5, +# "random_state": rng.integers(0, 1000), +# }, +# 600, # n_samples +# 1000, # n_repeats +# 1.0 / 6, # test_size +# ], +# ], +# ) def test_correlated_logit_model(hypotester, model_kwargs, n_samples, n_repeats, test_size): r"""Test MIGHT using MSE from linear model simulation. 
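The workflow these tests are expected to move to goes through the functional API rather than the removed estimator classes. The following is a minimal, hedged sketch of that invocation, assuming the ``build_coleman_forest`` signature and result fields shown in the hunks above; the data shapes and forest hyperparameters are illustrative only.

import numpy as np

from sktree import HonestForestClassifier
from sktree.stats import PermutationHonestForestClassifier, build_coleman_forest

rng = np.random.default_rng(0)
n_samples, n_features = 100, 10
X = rng.standard_normal(size=(n_samples, n_features))
y = np.vstack((np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))))  # binary target

clf = HonestForestClassifier(
    n_estimators=100, honest_fraction=0.5, bootstrap=True, random_state=0, n_jobs=-1
)
perm_clf = PermutationHonestForestClassifier(
    n_estimators=100, honest_fraction=0.5, bootstrap=True, random_state=0, n_jobs=-1
)

# ``covariate_index=None`` shuffles all covariates, per the docstring change above.
result = build_coleman_forest(
    clf, perm_clf, X, y, n_repeats=1000, metric="s@98", return_posteriors=False, seed=0
)
print(result.observe_test_stat, result.pvalue)

On informative features one would expect a small p-value; on pure noise, as in this sketch, the p-value should stay above conventional thresholds, which is what the new test below asserts.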
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 848440620..e20de8ea8 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -1,27 +1,16 @@ -import pickle -from copy import deepcopy -from itertools import combinations -from pathlib import Path - import numpy as np import pytest from flaky import flaky -from joblib import Parallel, delayed from numpy.testing import assert_almost_equal, assert_array_equal -from scipy.stats import wilcoxon from sklearn import datasets -from sktree import HonestForestClassifier, RandomForestClassifier, RandomForestRegressor -from sktree._lib.sklearn.tree import DecisionTreeClassifier +from sktree import HonestForestClassifier from sktree.stats import ( - FeatureImportanceForestClassifier, - FeatureImportanceForestRegressor, PermutationHonestForestClassifier, build_coleman_forest, build_permutation_forest, ) -from sktree.stats.utils import _non_nan_samples -from sktree.tree import MultiViewDecisionTreeClassifier, ObliqueDecisionTreeClassifier +from sktree.tree import MultiViewDecisionTreeClassifier # load the iris dataset (n_samples, 4) # and randomly permute it @@ -38,519 +27,97 @@ iris_y = iris_y[p] -@pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) -def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): - n_samples = 50 - n_estimators = 10 - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, - random_state=seed, - ), - permute_forest_fraction=1.0 / n_estimators * 5, - test_size=0.7, - random_state=seed, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") - - assert ( - len(est.train_test_samples_[0][1]) == n_samples * est.test_size - ), f"{len(est.train_test_samples_[0][1])} {n_samples * est.test_size}" - assert len(est.train_test_samples_[0][0]) == est._n_samples_ - n_samples * est.test_size - - est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mse") - assert ( - len(est.train_test_samples_[0][1]) == n_samples * est.test_size - ), f"{len(est.train_test_samples_[0][1])} {n_samples * est.test_size}" - assert len(est.train_test_samples_[0][0]) == est._n_samples_ - n_samples * est.test_size - - # covariate index should work with mse - est.reset() - est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mse") - with pytest.raises(RuntimeError, match="Metric must be"): - est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mi") - - # covariate index should work with mse - est.reset() - est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mse") - with pytest.raises(RuntimeError, match="Metric must be"): - est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mi") - - # covariate index must be an iterable - est.reset() - with pytest.raises(RuntimeError, match="covariate_index must be an iterable"): - est.statistic(iris_X[:n_samples], iris_y[:n_samples], 0, metric="mi") - - # covariate index must be an iterable of ints - est.reset() - with pytest.raises(RuntimeError, match="Not all covariate_index"): - est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi") +@pytest.mark.parametrize("seed", [10, 0]) +def test_small_dataset_independent(seed): + # XXX: unit test interestingly does not work for MI, possibly due to bias + bootstrap = True + n_samples = 100 + 
n_features = 1000 + n_estimators = 100 - with pytest.raises( - RuntimeError, match="permute_forest_fraction must be greater than 1./n_estimators" - ): - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, - random_state=seed, - ), - permute_forest_fraction=1.0 / n_samples, - test_size=0.7, - random_state=seed, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") - - with pytest.raises(RuntimeError, match="permute_forest_fraction must be non-negative."): - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, - random_state=seed, - ), - permute_forest_fraction=-1.0 / n_estimators * 5, - test_size=0.7, - random_state=seed, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") - - -@pytest.mark.parametrize("covariate_index", [None, [0, 1]]) -def test_featureimportance_forest_statistic_with_covariate_index(covariate_index): - """Tests that calling `est.statistic` with covariate_index defined works. - - There should be no issue calling `est.statistic` with covariate_index defined. - """ - n_estimators = 10 - n_samples = 10 + rng = np.random.default_rng(seed) + X = rng.standard_normal(size=(n_samples, n_features)) + y = np.vstack( + (np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))) + ) # Binary classification - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, - random_state=seed, - ), - permute_forest_fraction=1.0 / n_estimators * 5, - test_size=0.7, + clf = HonestForestClassifier( + n_estimators=n_estimators, random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=bootstrap, + max_samples=1.6, + max_features=0.3, + stratify=True, ) - est.statistic( - iris_X[:n_samples], iris_y[:n_samples], covariate_index=covariate_index, metric="mi" - ) - - -@pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) -def test_featureimportance_forest_stratified(sample_dataset_per_tree): - n_samples = 100 - n_estimators = 10 - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, - random_state=seed, - ), - permute_forest_fraction=1.0 / n_estimators * 5, - test_size=0.7, + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, random_state=seed, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") - - _, indices_test = est.train_test_samples_[0] - y_test = iris_y[indices_test] - - assert len(y_test[y_test == 0]) == len(y_test[y_test == 1]), ( - f"{len(y_test[y_test==0])} " f"{len(y_test[y_test==1])}" + n_jobs=-1, + honest_fraction=0.5, + bootstrap=bootstrap, + max_samples=1.6, + max_features=0.3, + stratify=True, ) - - est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mi") - - _, indices_test = est.train_test_samples_[0] - y_test = iris_y[indices_test] - - assert len(y_test[y_test == 0]) == len(y_test[y_test == 1]), ( - f"{len(y_test[y_test==0])} " f"{len(y_test[y_test==1])}" + result = build_coleman_forest( + clf, perm_clf, X, y, n_repeats=1000, metric="s@98", return_posteriors=False, seed=seed ) + print(result.observe_stat, result.permuted_stat, result.pvalue, result.observe_test_stat) + assert_almost_equal(np.abs(result.observe_test_stat), 0.0, decimal=1) + assert result.pvalue > 0.05, f"{result.pvalue}" - 
-def test_featureimportance_forest_errors(): - sample_dataset_per_tree = True - n_estimators = 10 - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, + # now permute only some of the features + feature_set_ends = [3, n_features] + clf = HonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=bootstrap, + max_features=0.3, + max_samples=1.6, + stratify=True, + tree_estimator=MultiViewDecisionTreeClassifier( + feature_set_ends=feature_set_ends, + apply_max_features_per_feature_set=True, ), - test_size=0.5, - permute_forest_fraction=None, - sample_dataset_per_tree=sample_dataset_per_tree, ) - with pytest.raises(RuntimeError, match="The estimator must be fitted"): - est.train_test_samples_ - - with pytest.raises(RuntimeError, match="There are less than 5 testing samples"): - est.statistic(iris_X[:5], iris_y[:5]) - - est = FeatureImportanceForestClassifier(estimator=RandomForestRegressor, test_size=0.5) - with pytest.raises(RuntimeError, match="Estimator must be"): - est.statistic(iris_X[:20], iris_y[:20]) - - est = FeatureImportanceForestRegressor(estimator=RandomForestClassifier, test_size=0.5) - with pytest.raises(RuntimeError, match="Estimator must be"): - est.statistic(iris_X[:20], iris_y[:20]) - - -@flaky(max_runs=2) -@pytest.mark.parametrize("criterion", ["gini", "entropy"]) -@pytest.mark.parametrize("honest_prior", ["empirical", "uniform"]) -@pytest.mark.parametrize( - "estimator", - [ - None, - DecisionTreeClassifier(), - ObliqueDecisionTreeClassifier(), - ], -) -@pytest.mark.parametrize( - "permute_forest_fraction", - [ - None, - 0.5, - ], -) -@pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) -def test_iris_pauc_statistic( - criterion, honest_prior, estimator, permute_forest_fraction, sample_dataset_per_tree -): - limit = 0.1 - max_features = "sqrt" - n_repeats = 200 - n_estimators = 25 - test_size = 0.2 - - # Check consistency on dataset iris. 
- clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - criterion=criterion, - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=estimator, - honest_prior=honest_prior, - random_state=0, - n_jobs=1, + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=bootstrap, + max_samples=1.6, + max_features=0.3, + stratify=True, + tree_estimator=MultiViewDecisionTreeClassifier( + feature_set_ends=feature_set_ends, + apply_max_features_per_feature_set=True, ), - test_size=test_size, - sample_dataset_per_tree=sample_dataset_per_tree, - permute_forest_fraction=permute_forest_fraction, ) - # now add completely uninformative feature - X = np.hstack((iris_X, rng.standard_normal(size=(iris_X.shape[0], 4)))) - - # test for unimportant feature set - clf.reset() - if sample_dataset_per_tree and permute_forest_fraction is None: - # test in another test - return - - stat, pvalue = clf.test( + result = build_coleman_forest( + clf, + perm_clf, X, - iris_y, - np.arange(iris_X.shape[0], X.shape[1], dtype=int).tolist(), - n_repeats=n_repeats, - metric="auc", - ) - print(pvalue) - assert pvalue > 0.05, f"pvalue: {pvalue}" - - # test for important features that are permuted - stat, pvalue = clf.test(X, iris_y, [0, 1, 2, 3], n_repeats=n_repeats, metric="auc") - print(pvalue) - assert pvalue < 0.05, f"pvalue: {pvalue}" - - # one must call `reset()` to make sure the test is run on a "new" feature set properly - with pytest.raises(RuntimeError, match="X must have 8 features"): - clf.statistic(iris_X, iris_y, metric="auc", max_fpr=limit) - - clf.reset() - score = clf.statistic(iris_X, iris_y, metric="auc", max_fpr=limit) - assert score >= 0.8, "Failed with pAUC: {0} for max fpr: {1}".format(score, limit) - - assert isinstance(clf.estimator_, HonestForestClassifier) - - -@pytest.mark.parametrize( - "forest_hyppo", - [ - FeatureImportanceForestRegressor( - estimator=RandomForestRegressor( - n_estimators=10, - ), - random_state=seed, - ), - FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=10, - ), - random_state=seed, - permute_forest_fraction=None, - sample_dataset_per_tree=False, - ), - ], -) -def test_forestht_check_inputs(forest_hyppo): - n_samples = 100 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - - # Test case 1: Valid input - forest_hyppo.statistic(X, y) - - # Test case 2: Invalid input with different number of samples - X_invalid = np.random.rand(n_samples + 1, X.shape[1]) - y_invalid = rng.integers(0, 2, size=n_samples + 1) - with pytest.raises(RuntimeError, match="X must have"): - forest_hyppo.statistic(X_invalid, y_invalid) - - # Test case 3: Invalid input with different number of features - X_invalid = np.random.rand(X.shape[0], n_features + 1) - with pytest.raises(RuntimeError, match="X must have"): - forest_hyppo.statistic(X_invalid, y) - - # Test case 4: Invalid input with incorrect y type target - y_invalid = np.random.rand(X.shape[0]) - with pytest.raises(RuntimeError, match="y must have type"): - forest_hyppo.statistic(X, y_invalid) - - -@pytest.mark.parametrize("backend", ["loky", "threading"]) -@pytest.mark.parametrize("n_jobs", [1, 2]) -def test_parallelization(backend, n_jobs): - """Test parallelization of training forests.""" - n_samples = 20 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, 
size=n_samples) # Binary classification - - def run_forest(covariate_index=None): - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=n_jobs, honest_fraction=0.2 - ), - test_size=0.5, - ) - pvalue = clf.test(X, y, covariate_index=[covariate_index], metric="mi") - return pvalue - - out = Parallel(n_jobs=-1, backend=backend)( - delayed(run_forest)(covariate_index) for covariate_index in range(n_features) + y, + covariate_index=[1, 2, 3, 4, 5], + n_repeats=1000, + metric="s@98", + return_posteriors=False, + seed=seed, ) - assert len(out) == n_features - - -def test_pickle(tmpdir): - """Test that pickling works and preserves fitted attributes.""" - n_samples = 100 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - n_repeats = 1000 - - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 - ), - test_size=0.5, - ) - stat, pvalue = clf.test(X, y, covariate_index=[1], metric="mi", n_repeats=n_repeats) - - with open(Path(tmpdir) / "clf.pkl", "wb") as fpath: - pickle.dump(clf, fpath) - - with open(Path(tmpdir) / "clf.pkl", "rb") as fpath: - clf_pickle = pickle.load(fpath) - - # recompute pvalue manually and compare - pickle_pvalue = ( - 1.0 + (clf_pickle.null_dist_ <= (clf_pickle.permute_stat_ - clf_pickle.observe_stat_)).sum() - ) / (1.0 + n_repeats) - assert pvalue == pickle_pvalue - assert clf_pickle.observe_stat_ == -( - stat - clf_pickle.permute_stat_ - ), f"{clf_pickle.observe_stat_} != {-(stat - clf_pickle.permute_stat_)}" - - attr_list = [ - "test_size", - "observe_samples_", - "y_true_final_", - "observe_posteriors_", - "observe_stat_", - "_is_fitted", - "permute_samples_", - "permute_posteriors_", - "permute_stat_", - "n_samples_test_", - "_n_samples_", - "_metric", - "train_test_samples_", - ] - for attr in attr_list: - assert_array_equal(getattr(clf, attr), getattr(clf_pickle, attr)) - - -@pytest.mark.parametrize( - "permute_forest_fraction", - [None, 0.5], - ids=["no_permute", "permute_forest_fraction"], -) -@pytest.mark.parametrize( - "sample_dataset_per_tree", [True, False], ids=["sample_dataset_per_tree", "no_sample_dataset"] -) -def test_sample_size_consistency_of_estimator_indices_( - permute_forest_fraction, sample_dataset_per_tree -): - """Test that the test-sample indices are what is expected.""" - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 - ), - test_size=0.5, - permute_forest_fraction=permute_forest_fraction, - sample_dataset_per_tree=sample_dataset_per_tree, - stratify=False, - ) - - n_samples = 100 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - - _, posteriors, samples = clf.statistic( - X, y, covariate_index=None, return_posteriors=True, metric="mi" - ) - - if sample_dataset_per_tree or permute_forest_fraction is not None: - # check the non-nans - non_nan_idx = _non_nan_samples(posteriors) - if sample_dataset_per_tree: - assert clf.n_samples_test_ == n_samples, f"{clf.n_samples_test_} != {n_samples}" - - sorted_sample_idx = sorted(np.unique(samples)) - sorted_est_samples_idx = sorted( - np.unique(np.concatenate([x[1] for x in clf.train_test_samples_]).flatten()) - ) - assert_array_equal(sorted_sample_idx, non_nan_idx) - - # the sample indices are 
equal to the output of the train/test indices_ - # only if there are no nans in the posteriors over all the samples - if np.sum(non_nan_idx) == n_samples: - assert_array_equal( - sorted_sample_idx, - sorted_est_samples_idx, - f"{set(sorted_sample_idx) - set(sorted_est_samples_idx)} and " - f"{set(sorted_est_samples_idx) - set(sorted_sample_idx)}", - ) - else: - assert_array_equal( - samples, - sorted(clf.train_test_samples_[0][1]), - err_msg=f"Samples {set(samples) - set(sorted(clf.train_test_samples_[0][1]))}.", - ) - assert len(_non_nan_samples(posteriors)) == len(samples) - - -@pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) -@pytest.mark.parametrize("seed", [None, 0]) -def test_sample_per_tree_samples_consistency_with_sklearnforest(seed, sample_dataset_per_tree): - n_samples = 100 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - n_estimators = 10 - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, random_state=seed, n_jobs=1, honest_fraction=0.2 - ), - test_size=0.5, - permute_forest_fraction=1.0 / n_estimators, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - other_clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 - ), - test_size=0.5, - permute_forest_fraction=None, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - - clf.statistic(X, y, covariate_index=None, metric="mi") - other_clf.statistic(X, y, covariate_index=None, metric="mi") - - # estimator indices should be preserved over multiple calls - estimator_train_test_indices = deepcopy(clf.train_test_samples_) - for idx in range(clf.n_estimators): - assert_array_equal(clf.train_test_samples_[idx][0], estimator_train_test_indices[idx][0]) - assert_array_equal(clf.train_test_samples_[idx][1], estimator_train_test_indices[idx][1]) - - # if the sample_dataset_per_tree, then the indices should be different across all trees - if sample_dataset_per_tree or clf.permute_forest_fraction > 0.0: - for indices, other_indices in combinations(clf.train_test_samples_, 2): - assert not np.array_equal(indices[0], other_indices[0]) - assert not np.array_equal(indices[1], other_indices[1]) - else: - for indices, other_indices in combinations(clf.train_test_samples_, 2): - assert_array_equal(indices[0], other_indices[0]) - assert_array_equal(indices[1], other_indices[1]) - - # estimator indices should be preserved over multiple calls - estimator_train_test_indices = deepcopy(other_clf.train_test_samples_) - for idx in range(clf.n_estimators): - assert_array_equal( - other_clf.train_test_samples_[idx][0], estimator_train_test_indices[idx][0] - ) - assert_array_equal( - other_clf.train_test_samples_[idx][1], estimator_train_test_indices[idx][1] - ) - - # when seed is passed, the indices should be deterministic - if seed is not None and sample_dataset_per_tree: - assert_array_equal( - clf.train_test_samples_[idx][0], other_clf.train_test_samples_[idx][0] - ) - assert_array_equal( - clf.train_test_samples_[idx][1], other_clf.train_test_samples_[idx][1] - ) - - -@pytest.mark.parametrize("seed", [None, 0]) -def test_small_dataset_independent(seed): - n_samples = 32 - n_features = 5 - - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=10, 
random_state=seed, n_jobs=1, honest_fraction=0.5 - ), - test_size=0.2, - permute_forest_fraction=None, - sample_dataset_per_tree=False, - ) - stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") - assert ~np.isnan(pvalue) - assert ~np.isnan(stat) - assert pvalue > 0.05 - - stat, pvalue = clf.test(X, y, metric="mi") - assert_almost_equal(stat, 0.0, decimal=1) - assert pvalue > 0.05 + assert ~np.isnan(result.pvalue) + assert ~np.isnan(result.observe_test_stat) + assert result.pvalue > 0.05, f"{result.pvalue}" @flaky(max_runs=3) -@pytest.mark.parametrize("seed", [None, 0]) +@pytest.mark.parametrize("seed", [10, 0]) def test_small_dataset_dependent(seed): - n_samples = 100 + n_samples = 50 n_features = 5 rng = np.random.default_rng(seed) @@ -562,130 +129,43 @@ def test_small_dataset_dependent(seed): [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] ) # Binary classification - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=50, random_state=seed, n_jobs=1, honest_fraction=0.5 - ), - test_size=0.2, - permute_forest_fraction=None, - sample_dataset_per_tree=False, + n_estimators = 100 + clf = HonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, ) - stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") - assert ~np.isnan(pvalue) - assert ~np.isnan(stat) - assert pvalue <= 0.05 - - stat, pvalue = clf.test(X, y, metric="mi") - assert pvalue <= 0.05 - - -@flaky(max_runs=3) -def test_no_traintest_split(): - n_samples = 500 - n_features = 5 - rng = np.random.default_rng(seed) - - X = rng.uniform(size=(n_samples, n_features)) - X = rng.uniform(size=(n_samples // 2, n_features)) - X2 = X * 2 - X = np.vstack([X, X2]) - y = np.vstack( - [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] - ) # Binary classification - - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=50, - max_features=n_features, - random_state=seed, - n_jobs=1, - honest_fraction=0.5, - ), - test_size=0.2, - train_test_split=False, - permute_forest_fraction=None, - sample_dataset_per_tree=False, + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, ) - stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") - - # since no train-test split, the training is all the data and the testing is none of the data - assert_array_equal(clf.train_test_samples_[0][0], np.arange(n_samples)) - assert_array_equal(clf.train_test_samples_[0][1], np.array([])) - - assert ~np.isnan(pvalue) - assert ~np.isnan(stat) - assert pvalue <= 0.05, f"{pvalue}" + result = build_coleman_forest( + clf, + perm_clf, + X, + y, + covariate_index=[1, 2], + n_repeats=1000, + metric="mi", + return_posteriors=False, + ) + assert ~np.isnan(result.pvalue) + assert ~np.isnan(result.observe_test_stat) + assert result.pvalue <= 0.05 - stat, pvalue = clf.test(X, y, metric="mi") - assert pvalue <= 0.05, f"{pvalue}" + result = build_coleman_forest(clf, perm_clf, X, y, metric="mi", return_posteriors=False) + assert result.pvalue <= 0.05 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - clf.reset() - - stat, pvalue = clf.test(X, y, metric="mi") - assert_almost_equal(stat, 0.0, decimal=1) - assert pvalue > 0.05, f"{pvalue}" - - stat, pvalue = clf.test(X, y, covariate_index=[1, 
2], metric="mi") - assert ~np.isnan(pvalue) - assert ~np.isnan(stat) - assert pvalue > 0.05, f"{pvalue}" - - -@pytest.mark.parametrize("permute_forest_fraction", [1.0 / 10, 0.5, 0.75, 1.0]) -@pytest.mark.parametrize("seed", [None, 0]) -def test_permute_forest_fraction(permute_forest_fraction, seed): - """Test proper handling of random seeds, shuffled covariates and train/test splits.""" - n_estimators = 10 - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, random_state=seed, n_jobs=1, honest_fraction=0.2 - ), - test_size=0.5, - permute_forest_fraction=permute_forest_fraction, - stratify=False, - ) - n_samples = 100 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - - _ = clf.statistic(X, y, covariate_index=None, return_posteriors=True, metric="mi") - - seed = None - train_test_splits = list(clf.train_test_samples_) - train_inds = None - test_inds = None - for idx, tree in enumerate(clf.estimator_.estimators_): - # All random seeds of the meta-forest should be as expected, where - # the seed only changes depending on permute forest fraction - if idx % int(permute_forest_fraction * clf.n_estimators) == 0: - prev_seed = seed - seed = clf._seeds[idx] - - assert seed == tree.random_state - assert prev_seed != seed - else: - assert seed == clf._seeds[idx], f"{seed} != {clf._seeds[idx]}" - assert seed == clf._seeds[idx - 1] - - # Next, train/test splits should be consistent for batches of trees - if idx % int(permute_forest_fraction * clf.n_estimators) == 0: - prev_train_inds = train_inds - prev_test_inds = test_inds - - train_inds, test_inds = train_test_splits[idx] - - assert (prev_train_inds != train_inds).any(), f"{prev_train_inds} == {train_inds}" - assert (prev_test_inds != test_inds).any(), f"{prev_test_inds} == {test_inds}" - else: - assert_array_equal(train_inds, train_test_splits[idx][0]) - assert_array_equal(test_inds, train_test_splits[idx][1]) - - -def test_comight_repeated_feature_sets(): +@pytest.mark.parametrize("seed", [10, 0]) +def test_comight_repeated_feature_sets(seed): """Test COMIGHT when there are repeated feature sets.""" n_samples = 50 n_features = 500 @@ -698,86 +178,58 @@ def test_comight_repeated_feature_sets(): X = np.vstack([X, X2]) y = np.vstack([np.zeros((n_samples, 1)), np.ones((n_samples, 1))]) # Binary classification - X = np.hstack((X, X)) + X = np.hstack((X, X + rng.standard_normal(size=(n_samples * 2, n_features)) * 1e-5)) feature_set_ends = [n_features, n_features * 2] - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=50, - random_state=seed, - n_jobs=1, - honest_fraction=0.5, - tree_estimator=MultiViewDecisionTreeClassifier( - feature_set_ends=feature_set_ends, - max_features=0.3, - apply_max_features_per_feature_set=True, - ), + n_estimators = 50 + clf = HonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, + max_features=0.3, + stratify=True, + tree_estimator=MultiViewDecisionTreeClassifier( + feature_set_ends=feature_set_ends, + apply_max_features_per_feature_set=True, ), - test_size=0.2, - permute_forest_fraction=None, - sample_dataset_per_tree=False, + ) + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, + max_features=0.3, + stratify=True, + 
tree_estimator=MultiViewDecisionTreeClassifier( + feature_set_ends=feature_set_ends, + apply_max_features_per_feature_set=True, + ), ) # first test MIGHT rejects the null, since there is information - stat, pvalue = clf.test(X, y, metric="mi") - assert pvalue < 0.05 + result = build_coleman_forest( + clf, perm_clf, X, y, n_repeats=1000, metric="mi", return_posteriors=False + ) + assert result.pvalue < 0.05 # second test CoMIGHT fails to reject the null, since the information # is entirely contained in the first feature set - stat, pvalue = clf.test(X, y, covariate_index=np.arange(n_features), metric="mi") - assert pvalue > 0.05, f"{pvalue}" - - -def test_null_with_partial_auc(): - limit = 0.1 - max_features = "sqrt" - n_repeats = 1000 - n_estimators = 20 - test_size = 0.2 - - # Check consistency on dataset iris. - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - random_state=0, - n_jobs=1, - ), - test_size=test_size, - random_state=0, - ) - # now add completely uninformative feature - X = np.hstack((iris_X, rng.standard_normal(size=(iris_X.shape[0], 4)))) - - stat, pvalue = clf.test( + result = build_coleman_forest( + clf, + perm_clf, X, - iris_y, - covariate_index=np.arange(2), - n_repeats=n_repeats, - metric="auc", - ) - first_null_dist = deepcopy(clf.null_dist_) - - # If we re-run it with a different seed, but now specifying max_fpr - # there should be a difference in the partial-AUC distribution - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - random_state=0, - n_jobs=1, - ), - test_size=test_size, - random_state=seed, - ) - stat, pvalue = clf.test( - X, iris_y, covariate_index=np.arange(2), n_repeats=n_repeats, metric="auc", max_fpr=limit + y, + n_repeats=1000, + covariate_index=np.arange(n_features), + metric="mi", + return_posteriors=False, ) - second_null_dist = clf.null_dist_ - null_dist_pvalue = wilcoxon(first_null_dist, second_null_dist).pvalue - assert null_dist_pvalue < 0.05, null_dist_pvalue - assert pvalue > 0.05, f"pvalue: {pvalue}" + assert result.pvalue > 0.05, f"{result.pvalue}" def test_build_coleman_forest(): diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py index 3ad9c065a..31232a38e 100644 --- a/sktree/tests/test_honest_forest.py +++ b/sktree/tests/test_honest_forest.py @@ -103,6 +103,9 @@ def test_iris_multi(criterion, max_features, honest_prior, estimator): n_estimators = 10 # Check consistency on dataset iris. + # Note: bootstrap is False here for backwards compatibility and making + # the unit-test pass. Since bootstrap is not the feature being tested + # here, this is fine. clf = HonestForestClassifier( criterion=criterion, random_state=0, @@ -110,6 +113,7 @@ def test_iris_multi(criterion, max_features, honest_prior, estimator): n_estimators=n_estimators, honest_prior=honest_prior, tree_estimator=estimator, + bootstrap=False, ) second_y = np.concatenate([(np.ones(50) * 3), (np.ones(50) * 4), (np.ones(50) * 5)]) @@ -259,7 +263,7 @@ def test_honest_decision_function(honest_fraction, val): @parametrize_with_checks( - [HonestForestClassifier(n_estimators=10, honest_fraction=0.5, random_state=0)] + [HonestForestClassifier(n_estimators=10, honest_fraction=0.5, random_state=0, bootstrap=False)] ) def test_sklearn_compatible_estimator(estimator, check): # 1. check_class_weight_classifiers is not supported since it requires sample weight