From 3cfb0823d5d0e645208e63b75846bb8267a266c1 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 11 Jun 2024 12:36:32 -0400 Subject: [PATCH] MAINT `HonestForestClassifier` now has `bootstrap=True` as the default. And removing old MIGHT code (#274) * Fix honest forest bootstrap option * Cleanup old class-based (CO)MIGHT code * Fix bug in `build_coleman_forest` where covariate_index was not being passed through --------- Signed-off-by: Adam Li --- doc/api.rst | 4 - doc/whats_new/v0.5.rst | 2 +- doc/whats_new/v0.6.rst | 2 +- doc/whats_new/v0.8.rst | 4 + examples/hypothesis_testing/README.txt | 6 - ...ot_MI_genuine_hypothesis_testing_forest.py | 132 -- .../plot_MI_imbalanced_hyppo_testing.py | 253 ---- .../plot_co_MIGHT_alternative.py | 220 --- .../hypothesis_testing/plot_co_MIGHT_null.py | 317 ----- examples/hypothesis_testing/plot_might_auc.py | 132 -- .../hypothesis_testing/plot_might_mv_auc.py | 135 -- sktree/ensemble/_honest_forest.py | 2 +- sktree/stats/__init__.py | 7 - sktree/stats/forestht.py | 1185 +---------------- sktree/stats/meson.build | 1 - sktree/stats/permutationforest.py | 439 ------ sktree/stats/tests/test_coleman.py | 207 ++- sktree/stats/tests/test_forestht.py | 842 ++---------- sktree/tests/test_honest_forest.py | 6 +- 19 files changed, 269 insertions(+), 3627 deletions(-) delete mode 100644 examples/hypothesis_testing/README.txt delete mode 100644 examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py delete mode 100644 examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py delete mode 100644 examples/hypothesis_testing/plot_co_MIGHT_alternative.py delete mode 100644 examples/hypothesis_testing/plot_co_MIGHT_null.py delete mode 100644 examples/hypothesis_testing/plot_might_auc.py delete mode 100644 examples/hypothesis_testing/plot_might_mv_auc.py delete mode 100644 sktree/stats/permutationforest.py diff --git a/doc/api.rst b/doc/api.rst index dd5ba611c..0c112b384 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -145,10 +145,6 @@ tree models. .. autosummary:: :toctree: generated/ - FeatureImportanceForestRegressor - FeatureImportanceForestClassifier - PermutationForestClassifier - PermutationForestRegressor build_coleman_forest build_permutation_forest build_oob_forest diff --git a/doc/whats_new/v0.5.rst b/doc/whats_new/v0.5.rst index ccb8892e0..c30e50086 100644 --- a/doc/whats_new/v0.5.rst +++ b/doc/whats_new/v0.5.rst @@ -26,7 +26,7 @@ Changelog handles the case where there is one feature view that is exhausted, and another that is not for ``apply_max_features_per_feature_set = False``, by `Adam Li`_ (:pr:`#183`). -- |Fix| :class:`sktree.stats.FeatureImportanceForestClassifier` now correctly passes +- |Fix| ``sktree.stats.FeatureImportanceForestClassifier`` now correctly passes metric kwargs to the null distribution function, by `Adam Li`_ (:pr:`#183`). Code and Documentation Contributors diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst index f12ce3a2a..79e3a5369 100644 --- a/doc/whats_new/v0.6.rst +++ b/doc/whats_new/v0.6.rst @@ -24,7 +24,7 @@ Changelog handles the case where there is one feature view that is exhausted, and another that is not for ``apply_max_features_per_feature_set = False``, by `Adam Li`_ (:pr:`#183`). -- |Fix| :class:`sktree.stats.FeatureImportanceForestClassifier` now correctly passes +- |Fix| ``sktree.stats.FeatureImportanceForestClassifier`` now correctly passes metric kwargs to the null distribution function, by `Adam Li`_ (:pr:`#183`). 
- |Enhancement| :func:`sktree.datasets.make_trunk_classification` now has a generative model based on Trunk and banded covariance, :func:`sktree.datasets.approximate_clf_mutual_information` and diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst index f021d1412..7b31bea3f 100644 --- a/doc/whats_new/v0.8.rst +++ b/doc/whats_new/v0.8.rst @@ -21,6 +21,10 @@ Changelog - |Feature| Simulations in ``sktree.datasets.hyppo`` now throw a warning instead of an error when the number of samples is less than the number of dimensions. By `Sambit Panda`_ (:pr:`#279`) +- |API| :class:`sktree.HonestForestClassifier` now has ``bootstrap=True`` as the default + argument. By `Adam Li`_ (:pr:`#274`) +- |API| Removed all instances of ``FeatureImportanceForestClassifier`` and outdated + MIGHT code. By `Adam Li`_ (:pr:`#274`) Code and Documentation Contributors ----------------------------------- diff --git a/examples/hypothesis_testing/README.txt b/examples/hypothesis_testing/README.txt deleted file mode 100644 index 9a60917ca..000000000 --- a/examples/hypothesis_testing/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -.. _hyppo_examples: - -Hypothesis testing with decision trees --------------------------------------- - -Examples demonstrating how to use decision-trees for statistical hypothesis testing. diff --git a/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py deleted file mode 100644 index 2ddec2289..000000000 --- a/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -========================================================= -Mutual Information for Genuine Hypothesis Testing (MIGHT) -========================================================= - -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated datasets. Here, we present a simulation -of how MIGHT is used to test the hypothesis that a "feature set is important for -predicting the target". This is a generalization of the framework presented in -:footcite:`coleman2022scalable`. - -We simulate a dataset with 1000 features, 500 samples, and a binary class target -variable. Within each feature set, there is 500 features associated with one feature -set, and another 500 features associated with another feature set. One could think of -these for example as different datasets collected on the same patient in a biomedical setting. -The first feature set (X) is strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). Here, we are testing the -null hypothesis: - -- ``H0: I(X; y) - I(X, W; y) = 0`` -- ``HA: I(X; y) - I(X, W; y) < 0`` indicating that there is more mutual information with - respect to ``y`` - -where ``I`` is mutual information. For example, this could be true in the following settings, -where X is our informative feature set and W is our uninformative feature set. - -- ``W X -> y``: here ``W`` is completely disconnected from X and y. -- ``W -> X -> y``: here ``W`` is d-separated from y given X. -- ``W <- X -> y``: here ``W`` is d-separated from y given X. - -We then use MIGHT to test the hypothesis that the first feature set is important for -predicting the target, and the second feature set is not important for predicting the -target. We use :class:`~sktree.stats.FeatureImportanceForestClassifier`. 
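# Since this patch removes ``FeatureImportanceForestClassifier``, the test sketched
# in the docstring above would now be expressed through the functional helpers that
# remain in ``sktree.stats``. The call pattern below is a rough sketch under assumed
# arguments; the exact ``build_coleman_forest`` signature and return value should be
# verified against the current API reference.
import numpy as np
from sktree import HonestForestClassifier
from sktree.stats import PermutationHonestForestClassifier, build_coleman_forest

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 20))
y = rng.binomial(1, 0.5, size=200)

# bootstrap=True is now the default for HonestForestClassifier after this patch
est = HonestForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
perm_est = PermutationHonestForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# assumed call: test whether the first 10 columns carry signal for y
result = build_coleman_forest(
    est, perm_est, X, y, covariate_index=np.arange(10), metric="mi"
)
print(result)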
-""" - -import numpy as np -from scipy.special import expit - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. - -n_samples = 2000 -n_features_set = 20 -mean = 1.0 -sigma = 2.0 -beta = 5.0 - -unimportant_mean = 0.0 -unimportant_sigma = 4.5 - -# first sample the informative features, and then the uniformative features -X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) -X_important = np.hstack( - [ - X_important, - rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) - ), - ] -) - -X_unimportant = rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) -) -X = np.hstack([X_important, X_unimportant]) - -# simulate the binary target variable -y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) - -# %% -# Perform hypothesis testing using Mutual Information -# --------------------------------------------------- -# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis -# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated -# between two forests. One forest is trained on the original dataset, and one forest is trained -# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly. -# -# The null distribution is then estimated in an efficient manner using the framework of -# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors) -# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then -# computed as the proportion of samples in the null distribution that are less than the -# observed test statistic. - -n_estimators = 100 -max_features = "sqrt" -test_size = 0.2 -n_repeats = 1000 -n_jobs = -1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.25, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -# we test for the first feature set, which is important and thus should return a pvalue < 0.05 -stat, pvalue = est.test( - X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats -) -print(f"Estimated MI difference for the important feature set: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 -stat, pvalue = est.test( - X, - y, - covariate_index=np.arange(n_features_set, dtype=int) + n_features_set, - metric="mi", - n_repeats=n_repeats, -) -print(f"Estimated MI difference for the unimportant feature set: {stat} with Pvalue: {pvalue}") - -# %% -# References -# ---------- -# .. 
footbibliography:: diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py deleted file mode 100644 index 95c5341ae..000000000 --- a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py +++ /dev/null @@ -1,253 +0,0 @@ -""" -============================================================================== -Mutual Information for Genuine Hypothesis Testing (MIGHT) with Imbalanced Data -============================================================================== - -Here, we demonstrate how to do hypothesis testing on highly imbalanced data -in terms of their feature-set dimensionalities. -using mutual information as a test statistic. We use the framework of -:footcite:`coleman2022scalable` to estimate pvalues efficiently. - -Here, we simulate two feature sets, one of which is important for the target, -but significantly smaller in dimensionality than the other feature set, which -is unimportant for the target. We then use the MIGHT framework to test for -the importance of each feature set. Instead of leveraging a normal honest random -forest to estimate the posteriors, here we leverage a multi-view honest random -forest, with knowledge of the multi-view structure of the ``X`` data. - -For other examples of hypothesis testing, see the following: - -- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_MI_genuine_hypothesis_testing_forest.py` -- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_might_auc.py` - -For more information on the multi-view decision-tree, see -:ref:`sphx_glr_auto_examples_multiview_plot_multiview_dtc.py`. -""" - -import matplotlib.pyplot as plt -import numpy as np -from sklearn.datasets import make_blobs - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. -# -# Our data will follow the following graphical model: -# -# $(X_1 \rightarrow Y; X_2)$ -# -# where $X_1$ is our signal feature set, $X_2$ is our noise feature set, and $Y$ is our target. -# $X_1$ will be low-dimensional, but $X_2$ is high-dimensional noise. - -seed = 12345 -rng = np.random.default_rng(seed) - - -def make_multiview_classification( - n_samples=100, n_features_1=10, n_features_2=1000, cluster_std=2.0, seed=None -): - rng = np.random.default_rng(seed=seed) - - # Create a high-dimensional multiview dataset with a low-dimensional informative - # subspace in one view of the dataset. 
- X0_first, y0 = make_blobs( - n_samples=n_samples, - cluster_std=cluster_std, - n_features=n_features_1, - random_state=rng.integers(1, 10000), - centers=1, - center_box=(-2, 2.0), - ) - - X1_first, y1 = make_blobs( - n_samples=n_samples, - cluster_std=cluster_std, - n_features=n_features_1, - random_state=rng.integers(1, 10000), - centers=1, - center_box=(-2.0, 2.0), - ) - - y1[:] = 1 - - # add the second view for y=0 and y=1, which is completely noise - X0 = np.concatenate([X0_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) - X1 = np.concatenate([X1_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) - - # combine the views and targets - X = np.vstack((X0, X1)) - y = np.hstack((y0, y1)).T - - # add noise to the data - X = X + rng.standard_normal(size=X.shape) - - return X, y - - -n_samples = 100 -n_features_1 = 10 -n_features_2 = 10_000 -n_features_views = [n_features_1, n_features_1 + n_features_2] - -X, y = make_multiview_classification( - n_samples=n_samples, - n_features_1=n_features_1, - n_features_2=n_features_2, - cluster_std=5.0, - seed=seed, -) - -print(X.shape, y.shape, n_features_views) -# %% -# Perform hypothesis testing using Mutual Information -# --------------------------------------------------- -# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis -# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated -# between two forests. One forest is trained on the original dataset, and one forest is trained -# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly. -# -# The null distribution is then estimated in an efficient manner using the framework of -# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors) -# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then -# computed as the proportion of samples in the null distribution that are less than the -# observed test statistic. 
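# The block below is a minimal, self-contained illustration of the tree-swapping
# scheme described above; it is not the library's implementation
# (``_compute_null_distribution_coleman``) and uses AUC instead of mutual
# information purely to keep the sketch short. Per-tree posteriors from the
# observed and permuted forests are randomly reassigned many times, and each
# reassignment contributes one draw to the null distribution.
import numpy as np
from sklearn.metrics import roc_auc_score

rng_null = np.random.default_rng(0)
n_trees, n_test = 100, 500
y_test = rng_null.binomial(1, 0.5, size=n_test)

# hypothetical per-tree posteriors for the positive class on the held-out samples
obs_posteriors = 0.5 * rng_null.uniform(size=(n_trees, n_test)) + 0.5 * y_test
perm_posteriors = rng_null.uniform(size=(n_trees, n_test))


def forest_metric(posteriors):
    # average the tree posteriors, then score them against the held-out labels
    return roc_auc_score(y_test, posteriors.mean(axis=0))


observed_stat = forest_metric(obs_posteriors) - forest_metric(perm_posteriors)

n_null_repeats = 1000
null_dist = np.empty(n_null_repeats)
for i in range(n_null_repeats):
    # randomly reassign each tree's posterior to one of the two "forests"
    swap = rng_null.integers(0, 2, size=n_trees).astype(bool)
    first = np.where(swap[:, None], obs_posteriors, perm_posteriors)
    second = np.where(swap[:, None], perm_posteriors, obs_posteriors)
    null_dist[i] = forest_metric(first) - forest_metric(second)

# one-sided pvalue: fraction of null draws at least as extreme as the observed stat
pvalue_sketch = (1 + np.sum(null_dist >= observed_stat)) / (1 + n_null_repeats)
print(f"observed statistic: {observed_stat:.3f}, pvalue: {pvalue_sketch:.3f}")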
- -n_estimators = 100 -max_features = "sqrt" -test_size = 0.2 -n_repeats = 1000 -n_jobs = -1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=MultiViewDecisionTreeClassifier( - feature_set_ends=n_features_views, - apply_max_features_per_feature_set=True, - ), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -mv_results = dict() - -# we test for the overall MI of X vs y -stat, pvalue = est.test(X, y, metric="mi", n_repeats=n_repeats) -mv_results["feature_stat"] = stat -mv_results["feature_pvalue"] = pvalue -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# we test for the first feature set, which is important and thus should return a pvalue < 0.05 -stat, pvalue = est.test( - X, y, covariate_index=np.arange(n_features_1, dtype=int), metric="mi", n_repeats=n_repeats -) -mv_results["important_feature_stat"] = stat -mv_results["important_feature_pvalue"] = pvalue -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 -stat, pvalue = est.test( - X, - y, - covariate_index=np.arange(n_features_1, n_features_2, dtype=int), - metric="mi", - n_repeats=n_repeats, -) -mv_results["unimportant_feature_stat"] = stat -mv_results["unimportant_feature_pvalue"] = pvalue -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# %% -# Let's investigate what happens when we do not use a multi-view decision tree. -# All other parameters are kept the same. - -# to ensure max-features is the same across the two models -max_features = int(np.sqrt(n_features_1) + np.sqrt(n_features_2)) - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -rf_results = dict() - -# we test for the overall MI of X vs y -stat, pvalue = est.test(X, y, metric="mi", n_repeats=n_repeats) -rf_results["feature_stat"] = stat -rf_results["feature_pvalue"] = pvalue -print("\n\nAnalyzing regular decision tree models:") -print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") - -# we test for the first feature set, which is important and thus should return a pvalue < 0.05 -stat, pvalue = est.test( - X, y, covariate_index=np.arange(n_features_1, dtype=int), metric="mi", n_repeats=n_repeats -) -rf_results["important_feature_stat"] = stat -rf_results["important_feature_pvalue"] = pvalue -print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 -stat, pvalue = est.test( - X, - y, - covariate_index=np.arange(n_features_1, n_features_2, dtype=int), - metric="mi", - n_repeats=n_repeats, -) -rf_results["unimportant_feature_stat"] = stat -rf_results["unimportant_feature_pvalue"] = pvalue -print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") - -fig, ax = plt.subplots(figsize=(5, 3)) - -# plot pvalues -ax.bar(0, rf_results["important_feature_pvalue"], label="Permuting $X_1$ (RF)") -ax.bar(1, rf_results["unimportant_feature_pvalue"], label="Permuting $X_2$ (RF)") -ax.bar(2, 
mv_results["important_feature_pvalue"], label="Permuting $X_1$ (MV)") -ax.bar(3, mv_results["unimportant_feature_pvalue"], label="Permuting $X_2$ (MV)") -ax.bar(4, mv_results["feature_pvalue"], label="Overall Feature Set (MV)") -ax.bar(5, rf_results["feature_pvalue"], label="Overall Feature Set (RF)") -ax.axhline(0.05, color="k", linestyle="--", label="alpha=0.05") -ax.set(ylabel="Log10(PValue)", xlim=[-0.5, 5.5], yscale="log") -ax.legend() - -fig.tight_layout() -plt.show() - -# %% -# Discussion -# ---------- -# We see that the multi-view decision tree is able to detect the important feature set, -# while the regular decision tree is not. This is because the regular decision tree -# is not aware of the multi-view structure of the data, and thus is challenged -# by the imbalanced dimensionality of the feature sets. I.e. it rarely splits on -# the first low-dimensional feature set, and thus is unable to detect its importance. -# -# Note both approaches still fail to reject the null hypothesis (for alpha of 0.05) -# when testing the unimportant feature set. The difference in the two approaches -# show the statistical power of the multi-view decision tree is higher than the -# regular decision tree in this simulation. - -# %% -# References -# ---------- -# .. footbibliography:: diff --git a/examples/hypothesis_testing/plot_co_MIGHT_alternative.py b/examples/hypothesis_testing/plot_co_MIGHT_alternative.py deleted file mode 100644 index fd33c335e..000000000 --- a/examples/hypothesis_testing/plot_co_MIGHT_alternative.py +++ /dev/null @@ -1,220 +0,0 @@ -""" -==================================================================================== -Demonstrate Conditional Mutual Information for Genuine Hypothesis Testing (Co-MIGHT) -==================================================================================== - -In this example, we demonstrate how to test the conditional mutual information (CMI) -hypothesis test. To perform CMI testing, we have the hypothesis test: - -- $H_0: I(X_2; Y | X_1) = 0$ -- $H_1: I(X_2; Y | X_1) > 0$ - -Here, we simulate two feature-sets, which are both informative for the target. The -data-generating process follows the graphical model: - -$(X_1 \\rightarrow X_2 \\rightarrow Y; X_1 \\rightarrow Y)$ - -This means that $I(X_1; Y | X_2) > 0$ if we had a perfect estimate of CMI. - -We will demonstrate how to perform the CMI test properly using a conditional -permutation (compared to a standard permutation) of samples. We specifically -explore the case where the alternative hypothesis is true and determine when -the test is able to reject the null hypothesis correctly. - -.. note:: This does not exactly implement conditional-independence testing - yet, since we do not have a way to estimate the null distribution of - $I(X_2; Y | X_1)$. However, we can still demonstrate the power of the - test in the case where the alternative hypothesis is true. -""" - -import matplotlib.pyplot as plt -import numpy as np -from sklearn.datasets import make_spd_matrix - -from sktree import HonestForestClassifier -from sktree.datasets import make_gaussian_mixture -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. 
-seed = 12345 -rng = np.random.default_rng(seed) - -n_samples = 200 - -n_features = 20 -noise_dims = 80 -class_probs = [0.6, 0.4] -n_features_2 = 1000 - noise_dims - -fixed_center = rng.standard_normal(size=(n_features,)) -centers = [fixed_center, fixed_center] - -covariances = [ - make_spd_matrix(n_dim=n_features, random_state=seed), - make_spd_matrix(n_dim=n_features, random_state=seed + 123), -] - -Xs, y = make_gaussian_mixture( - centers, - covariances, - n_samples=n_samples, - noise=1.0, - noise_dims=noise_dims, - shuffle=True, - class_probs=class_probs, - random_state=seed, - transform="linear", -) - -second_X = Xs[0] -first_X = Xs[1] - -print(first_X.shape, second_X.shape) -X = np.hstack((first_X, second_X, rng.standard_normal(size=(n_samples, n_features_2 - n_features)))) -n_features_ends = [ - n_features + noise_dims, - n_features_2 + n_features + noise_dims * 2, -] - -print(X.shape, y.shape, n_features_ends) - -# %% -# Perform hypothesis testing -# -------------------------- -# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis -# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated -# between two forests. One forest is trained on the original dataset, and one forest is trained -# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly. -# -# The null distribution is then estimated in an efficient manner using the framework of -# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors) -# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then -# computed as the proportion of samples in the null distribution that are less than the -# observed test statistic. - -n_estimators = 200 -max_features = 0.3 -test_size = 0.2 -n_repeats = 1000 -n_jobs = -1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=MultiViewDecisionTreeClassifier( - feature_set_ends=n_features_ends, - apply_max_features_per_feature_set=True, - ), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - sample_dataset_per_tree=False, -) - -mv_results = dict() - -# we test for the first feature set, which is lower-dimensional -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -mv_results["low_dim_feature_stat"] = stat -mv_results["low_dim_feature_pvalue"] = pvalue -print("Analysis with multi-view decision tree.") -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -mv_results["high_dim_feature_stat"] = stat -mv_results["high_dim_feature_pvalue"] = pvalue -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") - -# %% -# Let's investigate what happens when we do not use a multi-view decision tree. -# All other parameters are kept the same. 
- -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - sample_dataset_per_tree=False, -) - -rf_results = dict() - -# we test for the first feature set, which is lower-dimensional -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -rf_results["low_dim_feature_stat"] = stat -rf_results["low_dim_feature_pvalue"] = pvalue -print("\n\nAnalysing now with a regular decision-tree") -print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -rf_results["high_dim_feature_stat"] = stat -rf_results["high_dim_feature_pvalue"] = pvalue -print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}") - -fig, ax = plt.subplots(figsize=(5, 3)) - -# plot pvalues -ax.bar(0, rf_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (RF)") -ax.bar(1, rf_results["high_dim_feature_pvalue"], label="High-dim Feature Set (RF)") -ax.bar(2, mv_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (MV)") -ax.bar(3, mv_results["high_dim_feature_pvalue"], label="High-dim Feature Set (MV)") -ax.axhline(0.05, color="k", linestyle="--", label="alpha=0.05") -ax.set(ylabel="Log10(PValue)", xlim=[-0.5, 3.5], yscale="log") -ax.legend() - -fig.tight_layout() -plt.show() - -# %% -# Discussion -# ---------- -# In this example, since both feature-sets are in informative for the target, the true -# answer should be reject the null hypothesis. We see that neither the regular decision -# tree, nor the multi-view decision tree is able to reject the null hypothesis correctly. -# -# However, if we permute the lower-dimensional feature-set, the multi-view decision tree -# has a lower pvalue than the regular decision tree, which allows us to correctly -# reject the null hypothesis at lower $\alpha$ levels. - -# %% -# References -# ---------- -# .. footbibliography:: diff --git a/examples/hypothesis_testing/plot_co_MIGHT_null.py b/examples/hypothesis_testing/plot_co_MIGHT_null.py deleted file mode 100644 index 2e6325cd1..000000000 --- a/examples/hypothesis_testing/plot_co_MIGHT_null.py +++ /dev/null @@ -1,317 +0,0 @@ -""" -==================================================== -Co-MIGHT when Data Exhibits Conditional Independence -==================================================== - -In this example, we demonstrate how to test the conditional mutual information (CMI) -hypothesis test using conditional mutual information for genuine hypothesis test (Co-MIGHT). -To perform CMI testing, we have the hypothesis test: - -- $H_0: I(X_2; Y | X_1) = 0$ -- $H_1: I(X_2; Y | X_1) > 0$ - -Here, we simulate two feature-sets, which follow the null-hypothesis with the specific -setting that $X_2 \\perp \{Y, X_1\}$. We will test using the multi-view decision -tree to verify that that the null hypothesis is not rejected. 
-""" - -import matplotlib.pyplot as plt -import numpy as np -from sklearn.datasets import make_classification - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. - -n_samples = 200 -n_features_1 = 20 -noise_dims = 80 -n_features_2 = 1000 - -signal_X, y = make_classification( - n_samples=n_samples, - n_features=n_features_1 + noise_dims, - n_informative=n_features_1, - n_redundant=50, - n_repeated=0, - n_classes=2, - class_sep=0.5, - flip_y=0.01, - shuffle=True, - random_state=seed, -) - -# model parameters -n_estimators = 200 -max_features = 0.3 -test_size = 0.2 -n_repeats = 1000 -n_jobs = -1 - -# %% -# Analysis when the null hypothesis is true -# ----------------------------------------- -# Let's now investigate what happens when the null hypothesis is true. We will simulate -# data from the graphical model: -# -# $(X_1 \\rightarrow Y; X_2)$ -# -# Here, we either have $X_1$ or $X_2$ informative for the target, but not both. We will -# then perform hypothesis testing using the same procedure as above. We will test the settings -# when the high-dimensional feature-set is informative for the target, and when the -# low-dimensional feature-set is informative for the target. - -# Make X_2 high-dimensional -n_features_ends = [n_features_1 + noise_dims, signal_X.shape[1]] -_X = np.hstack((signal_X, rng.standard_normal(size=(n_samples, n_features_2)))) -X = _X.copy() -n_features_ends[1] = X.shape[1] - -print(X.shape, y.shape, n_features_ends) - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=MultiViewDecisionTreeClassifier( - feature_set_ends=n_features_ends, - apply_max_features_per_feature_set=True, - ), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - sample_dataset_per_tree=False, -) - -rf_est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -rf_results = dict() -mv_results = dict() - -# we test for the first feature set, which is lower-dimensional -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -mv_results["low_dim_feature_stat"] = stat -mv_results["low_dim_feature_pvalue"] = pvalue -print("\n\nImportant feature-set is low-dimensional") -print(f"Estimated MI difference with first view (has dependency): {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -mv_results["high_dim_feature_stat"] = stat -mv_results["high_dim_feature_pvalue"] = pvalue -print( - f"Estimated MI difference testing second view (does not have dependency): " - f"{stat} with Pvalue: {pvalue}" -) - -# %% -# Now, we will 
compare with using a standard decision tree classifier as our base model. - -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = rf_est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -rf_results["low_dim_feature_stat"] = stat -rf_results["low_dim_feature_pvalue"] = pvalue -print("\n\nComparing with random forest.") -print(f"Estimated MI difference with first view (has dependency): {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = rf_est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -rf_results["high_dim_feature_stat"] = stat -rf_results["high_dim_feature_pvalue"] = pvalue -print( - f"Estimated MI difference testing second view (does not have dependency): " - f"{stat} with Pvalue: {pvalue}" -) - -fig, ax = plt.subplots(figsize=(5, 3)) - -# plot pvalues -ax.bar(0, rf_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (RF)", color="black") -ax.bar(1, rf_results["high_dim_feature_pvalue"], label="High-dim Feature Set (RF)", color="gray") -ax.bar(2, mv_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (MV)", color="green") -ax.bar(3, mv_results["high_dim_feature_pvalue"], label="High-dim Feature Set (MV)", color="blue") -ax.axhline(0.05, color="k", linestyle="--", label="alpha=0.05") -ax.set( - ylabel="Log10(PValue)", - xlim=[-0.5, 3.5], - yscale="log", - title="Signal Feature-set is Low-dimensional", -) -ax.legend() - -fig.tight_layout() -plt.show() - -# %% -# Now, we will make the informative feature-set, $X_1$, high-dimensional -# and verify that the null hypothesis is not rejected still. 
-_X = np.hstack( - ( - signal_X, - rng.standard_normal(size=(n_samples, n_features_2 - signal_X.shape[1])), - rng.standard_normal(size=(n_samples, n_features_1 + noise_dims)), - ) -) -X = _X.copy() -n_features_ends = [n_features_2, X.shape[1]] -print("\n\nSetting important feature-set to be high-dimensional.") -print(X.shape, n_features_ends) - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=MultiViewDecisionTreeClassifier( - feature_set_ends=n_features_ends, - apply_max_features_per_feature_set=True, - ), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -rf_est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=0.5, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, -) - -mv_results = dict() -rf_results = dict() - -# we test for the first feature set, which is lower-dimensional -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -mv_results["high_dim_feature_stat"] = stat -mv_results["high_dim_feature_pvalue"] = pvalue -print("\n\nImportant feature-set is high-dimensional") -print(f"Estimated MI difference with first view (has dependency): {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -mv_results["low_dim_feature_stat"] = stat -mv_results["low_dim_feature_pvalue"] = pvalue -print( - f"Estimated MI difference testing second view (does not have dependency): " - f"{stat} with Pvalue: {pvalue}" -) - -# %% -# Again, we compare to using a standard decision tree classifier as our base model. 
- -covariate_index = np.arange(n_features_ends[0], dtype=int) -stat, pvalue = rf_est.test(X, y, covariate_index=covariate_index, metric="mi", n_repeats=n_repeats) - -rf_results["low_dim_feature_stat"] = stat -rf_results["low_dim_feature_pvalue"] = pvalue -print("\n\nComparing with random forest.") -print(f"Estimated MI difference with first view (has dependency): {stat} with Pvalue: {pvalue}") - -# we test for the second feature set, which is higher-dimensional -covariate_index = np.arange(n_features_ends[0], n_features_ends[1], dtype=int) -stat, pvalue = rf_est.test( - X, - y, - covariate_index=covariate_index, - metric="mi", - n_repeats=n_repeats, -) -rf_results["high_dim_feature_stat"] = stat -rf_results["high_dim_feature_pvalue"] = pvalue -print( - f"Estimated MI difference testing second view (does not have dependency): " - f"{stat} with Pvalue: {pvalue}" -) - -fig, ax = plt.subplots(figsize=(5, 3)) - -# plot pvalues -ax.bar(0, rf_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (RF)", color="black") -ax.bar(1, rf_results["high_dim_feature_pvalue"], label="High-dim Feature Set (RF)", color="black") -ax.bar(2, mv_results["low_dim_feature_pvalue"], label="Low-dim Feature Set (MV)", color="green") -ax.bar(3, mv_results["high_dim_feature_pvalue"], label="High-dim Feature Set (MV)", color="green") -ax.axhline(0.05, color="k", linestyle="--", label="alpha=0.05") -ax.set( - ylabel="Log10(PValue)", - xlim=[-0.5, 3.5], - yscale="log", - title="Signal Feature-set is High-dimensional", -) -ax.legend() - -fig.tight_layout() -plt.show() - - -# %% -# Discussion -# ---------- -# We see that when the null hypothesis is true, the multi-view decision tree does not -# reject the null hypothesis. In addition, it rejects the null hypothesis when there is -# a dependency between the target and the feature-set even when the feature-set is -# higher-dimensionality. This is in contrast to the standard decision tree, which -# fails to reject the null hypothesis when the feature-set with signal is -# higher-dimensional. - -# %% -# References -# ---------- -# .. footbibliography:: diff --git a/examples/hypothesis_testing/plot_might_auc.py b/examples/hypothesis_testing/plot_might_auc.py deleted file mode 100644 index 74ea79abd..000000000 --- a/examples/hypothesis_testing/plot_might_auc.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -=================================================================================== -Compute partial AUC using Mutual Information for Genuine Hypothesis Testing (MIGHT) -=================================================================================== - -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated datasets. Here, we present a simulation -of how MIGHT is used to evaluate how a "feature set is important for predicting the target". - -We simulate a dataset with 1000 features, 500 samples, and a binary class target -variable. Within each feature set, there is 500 features associated with one feature -set, and another 500 features associated with another feature set. One could think of -these for example as different datasets collected on the same patient in a biomedical setting. -The first feature set (X) is strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). - -We then use MIGHT to calculate the partial AUC of these sets. 
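# "Partial AUC" here is the area under the ROC curve restricted to low
# false-positive rates. A toy illustration with made-up scores, using
# scikit-learn's ``roc_auc_score`` and its ``max_fpr`` argument (the same kind of
# restriction that the ``max_fpr`` parameter below applies to the test statistic;
# note scikit-learn returns the standardized, McClish-corrected partial AUC):
import numpy as np
from sklearn.metrics import roc_auc_score

_auc_rng = np.random.default_rng(0)
y_true_toy = _auc_rng.binomial(1, 0.5, size=1000)
y_score_toy = 0.3 * y_true_toy + _auc_rng.uniform(size=1000)  # mildly informative scores

print("full AUC:", roc_auc_score(y_true_toy, y_score_toy))
print("partial AUC (FPR <= 0.1):", roc_auc_score(y_true_toy, y_score_toy, max_fpr=0.1))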
-""" - -import numpy as np -from scipy.special import expit - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. - -n_samples = 1000 -n_features_set = 500 -mean = 1.0 -sigma = 2.0 -beta = 5.0 - -unimportant_mean = 0.0 -unimportant_sigma = 4.5 - -# first sample the informative features, and then the uniformative features -X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) -X_important = np.hstack( - [ - X_important, - rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) - ), - ] -) - -X_unimportant = rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) -) - -# simulate the binary target variable -y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) - -# %% -# Use partial AUC as test statistic -# --------------------------------- -# You can specify the maximum specificity by modifying ``max_fpr`` in ``statistic``. - -n_estimators = 125 -max_features = "sqrt" -metric = "auc" -test_size = 0.2 -n_jobs = -1 -honest_fraction = 0.7 -max_fpr = 0.1 - -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - random_state=seed, - honest_fraction=honest_fraction, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - permute_forest_fraction=1.0 / n_estimators, - sample_dataset_per_tree=True, -) - -# we test for the first feature set, which is important and thus should return a higher AUC -stat, posterior_arr, samples = est.statistic( - X_important, - y, - metric=metric, - return_posteriors=True, - max_fpr=max_fpr, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# Repeat for the second feature set -# --------------------------------- -# This feature set has a smaller statistic, which is expected due to its weak correlation. - -stat, posterior_arr, samples = est.statistic( - X_unimportant, - y, - metric=metric, - return_posteriors=True, - max_fpr=max_fpr, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# All posteriors are saved within the model -# ----------------------------------------- -# Extract the results from the model variables anytime. You can save the model with ``pickle``. 
-# -# ASH-90 / Partial AUC: ``est.observe_stat_`` -# -# Observed Samples: ``est.observe_samples_`` -# -# Tree Posteriors for the positive class: ``est.observe_posteriors_`` (n_trees, n_samples_test, 1) -# -# True Labels: ``est.y_true_final_`` diff --git a/examples/hypothesis_testing/plot_might_mv_auc.py b/examples/hypothesis_testing/plot_might_mv_auc.py deleted file mode 100644 index 2a492eb7c..000000000 --- a/examples/hypothesis_testing/plot_might_mv_auc.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -===================================================== -Compute partial AUC using multi-view MIGHT (MV-MIGHT) -===================================================== - -An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric -multivariate hypothesis test, on simulated mutli-view datasets. Here, we present -how to estimate partial AUROC from a multi-view feature set. - -We simulate a dataset with 510 features, 1000 samples, and a binary class target variable. -The first 10 features (X) are strongly correlated with the target, and the second -feature set (W) is weakly correlated with the target (y). - -We then use MV-MIGHT to calculate the partial AUC of these sets. -""" - -import numpy as np -from scipy.special import expit - -from sktree import HonestForestClassifier -from sktree.stats import FeatureImportanceForestClassifier -from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier - -seed = 12345 -rng = np.random.default_rng(seed) - -# %% -# Simulate data -# ------------- -# We simulate the two feature sets, and the target variable. We then combine them -# into a single dataset to perform hypothesis testing. - -n_samples = 1000 -n_features_set = 500 -mean = 1.0 -sigma = 2.0 -beta = 5.0 - -unimportant_mean = 0.0 -unimportant_sigma = 4.5 - -# first sample the informative features, and then the uniformative features -X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) -X_unimportant = rng.normal( - loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) -) -X = np.hstack([X_important, X_unimportant]) - -# simulate the binary target variable -y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) - -# %% -# Use partial AUC as test statistic -# --------------------------------- -# You can specify the maximum specificity by modifying ``max_fpr`` in ``statistic``. - -n_estimators = 125 -max_features = "sqrt" -metric = "auc" -test_size = 0.2 -n_jobs = -1 -honest_fraction = 0.5 -max_fpr = 0.1 - -est_mv = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=MultiViewDecisionTreeClassifier(feature_set_ends=[10, 10 + n_features_set]), - honest_fraction=honest_fraction, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - permute_forest_fraction=1.0 / n_estimators, - sample_dataset_per_tree=True, -) - -# we test with the multi-view setting, thus should return a higher AUC -stat, posterior_arr, samples = est_mv.statistic( - X, - y, - metric=metric, - return_posteriors=True, - max_fpr=max_fpr, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# Repeat without multi-view -# --------------------------------- -# This feature set has a smaller statistic, which is expected due to its lack of multi-view setting. 
- -est = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=DecisionTreeClassifier(), - honest_fraction=honest_fraction, - n_jobs=n_jobs, - ), - random_state=seed, - test_size=test_size, - permute_forest_fraction=1.0 / n_estimators, - sample_dataset_per_tree=True, -) - -stat, posterior_arr, samples = est.statistic( - X, - y, - metric=metric, - return_posteriors=True, - max_fpr=max_fpr, -) - -print(f"ASH-90 / Partial AUC: {stat}") -print(f"Shape of Observed Samples: {samples.shape}") -print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") - -# %% -# All posteriors are saved within the model -# ----------------------------------------- -# Extract the results from the model variables anytime. You can save the model with ``pickle``. -# -# ASH-90 / Partial AUC: ``est_mv.observe_stat_`` -# -# Observed Samples: ``est_mv.observe_samples_`` -# -# Tree Posteriors for the positive class: ``est_mv.observe_posteriors_`` -# (n_trees, n_samples_test, 1) -# -# True Labels: ``est_mv.y_true_final_`` diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index b1647b9ca..484fca881 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -402,7 +402,7 @@ def __init__( max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=0.0, - bootstrap=False, + bootstrap=True, oob_score=False, n_jobs=None, random_state=None, diff --git a/sktree/stats/__init__.py b/sktree/stats/__init__.py index 188694c0a..8ca7a11ea 100644 --- a/sktree/stats/__init__.py +++ b/sktree/stats/__init__.py @@ -1,20 +1,13 @@ from .forestht import ( - FeatureImportanceForestClassifier, - FeatureImportanceForestRegressor, build_coleman_forest, build_cv_forest, build_oob_forest, build_permutation_forest, ) from .monte_carlo import PermutationTest -from .permutationforest import PermutationForestClassifier, PermutationForestRegressor from .permuteforest import PermutationHonestForestClassifier __all__ = [ - "FeatureImportanceForestClassifier", - "FeatureImportanceForestRegressor", - "PermutationForestClassifier", - "PermutationForestRegressor", "PermutationTest", "build_cv_forest", "build_oob_forest", diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index b84b06973..a03a7dd73 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -1,39 +1,19 @@ import threading from collections import namedtuple -from typing import Callable, Optional, Tuple, Union +from typing import Callable import numpy as np from joblib import Parallel, delayed from numpy.typing import ArrayLike -from sklearn.base import MetaEstimatorMixin, clone, is_classifier +from sklearn.base import clone from sklearn.ensemble._base import _partition_estimators -from sklearn.ensemble._forest import ForestClassifier as sklearnForestClassifier -from sklearn.ensemble._forest import ForestRegressor as sklearnForestRegressor from sklearn.model_selection import StratifiedKFold, train_test_split from sklearn.utils.multiclass import type_of_target -from sklearn.utils.validation import _is_fitted, check_X_y -from .._lib.sklearn.ensemble._forest import ( - ForestClassifier, - ForestRegressor, - RandomForestClassifier, - RandomForestRegressor, - _get_n_samples_bootstrap, - _parallel_build_trees, -) -from ..ensemble._honest_forest import HonestForestClassifier -from ..experimental import conditional_resample -from ..tree import DecisionTreeClassifier, DecisionTreeRegressor +from 
.._lib.sklearn.ensemble._forest import ForestClassifier from ..tree._classes import DTYPE from .permuteforest import PermutationHonestForestClassifier -from .utils import ( - METRIC_FUNCTIONS, - POSITIVE_METRICS, - POSTERIOR_FUNCTIONS, - REGRESSOR_METRICS, - _compute_null_distribution_coleman, - _non_nan_samples, -) +from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS, _compute_null_distribution_coleman def _parallel_predict_proba(predict_proba, X, indices_test): @@ -48,1156 +28,6 @@ def _parallel_predict_proba(predict_proba, X, indices_test): return prediction -def _parallel_build_trees_with_sepdata( - tree: Union[DecisionTreeClassifier, DecisionTreeRegressor], - n_trees: int, - idx: int, - indices_train: ArrayLike, - X: ArrayLike, - y: ArrayLike, - covariate_index, - bootstrap: bool, - max_samples, - sample_weight: Optional[ArrayLike] = None, - class_weight=None, - missing_values_in_feature_mask=None, - classes=None, - random_state=None, -): - """Parallel function to build trees and compute posteriors. - - This inherently assumes that the caller function defines the indices - for the training and testing data for each tree. - """ - rng = np.random.default_rng(random_state) - X_train = X[indices_train, :] - y_train = y[indices_train, ...] - - if bootstrap: - n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples=X_train.shape[0], max_samples=max_samples - ) - else: - n_samples_bootstrap = None - - # XXX: this currently creates a copy of X_train on RAM, which is not ideal - # individual tree permutation of y labels - if covariate_index is not None: - indices = np.arange(X_train.shape[0], dtype=int) - # perform permutation of covariates - index_arr = rng.choice(indices, size=(X_train.shape[0], 1), replace=False, shuffle=True) - perm_X_cov = X_train[index_arr, covariate_index] - X_train[:, covariate_index] = perm_X_cov - - tree = _parallel_build_trees( - tree, - bootstrap, - X_train, - y_train, - sample_weight, - idx, - n_trees, - verbose=0, - class_weight=class_weight, - n_samples_bootstrap=n_samples_bootstrap, - missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, - ) - return tree - - -class BaseForestHT(MetaEstimatorMixin): - observe_samples_: ArrayLike - observe_posteriors_: ArrayLike - observe_stat_: float - permute_samples_: ArrayLike - permute_posteriors_: ArrayLike - permute_stat_: float - - def __init__( - self, - estimator=None, - random_state=None, - verbose=0, - test_size=0.2, - stratify=False, - conditional_perm=False, - sample_dataset_per_tree=False, - permute_forest_fraction=None, - train_test_split=True, - ): - self.estimator = estimator - self.random_state = random_state - self.verbose = verbose - self.test_size = test_size - self.stratify = stratify - self.conditional_perm = conditional_perm - - self.train_test_split = train_test_split - # XXX: possibly removing these parameters - self.sample_dataset_per_tree = sample_dataset_per_tree - self.permute_forest_fraction = permute_forest_fraction - - self.n_samples_test_ = None - self._n_samples_ = None - self._metric = None - self._covariate_index_cache_ = None - self._type_of_target_ = None - self.n_features_in_ = None - self._is_fitted = False - self._seeds = None - self._perm_seeds = None - - @property - def n_estimators(self): - try: - return self.estimator_.n_estimators - except Exception: - return self.permuted_estimator_.n_estimators - finally: - return self._get_estimator().n_estimators - - def reset(self): - class_attributes = dir(type(self)) - instance_attributes = dir(self) - - for 
attr_name in instance_attributes: - if attr_name.endswith("_") and attr_name not in class_attributes: - delattr(self, attr_name) - - self.n_samples_test_ = None - self._n_samples_ = None - self._covariate_index_cache_ = None - self._type_of_target_ = None - self._metric = None - self.n_features_in_ = None - self._is_fitted = False - self._seeds = None - self._y = None - - def _get_estimators_indices(self, stratifier=None, sample_separate=False): - indices = np.arange(self._n_samples_, dtype=int) - - # Get drawn indices along both sample and feature axes - rng = np.random.default_rng(self.estimator_.random_state) - - if self.permute_forest_fraction is None: - permute_forest_fraction = 0.0 - else: - permute_forest_fraction = self.permute_forest_fraction - - # TODO: consolidate how we "sample/permute" per subset of the forest - if self.sample_dataset_per_tree or permute_forest_fraction > 0.0: - # sample random seeds - if self._seeds is None: - self._seeds = [] - self._n_permutations = 0 - - for itree in range(self.n_estimators): - # For every N-trees that are defined by permute forest fraction - # we will sample a new seed appropriately - if itree % max(int(permute_forest_fraction * self.n_estimators), 1) == 0: - tree = self.estimator_.estimators_[itree] - if tree.random_state is None: - seed = rng.integers(low=0, high=np.iinfo(np.int32).max) - else: - seed = tree.random_state - - self._seeds.append(seed) - seeds = self._seeds - - for idx, tree in enumerate(self.estimator_.estimators_): - seed = seeds[idx] - - # Operations accessing random_state must be performed identically - # to those in `_parallel_build_trees()` - indices_train, indices_test = train_test_split( - indices, - test_size=self.test_size, - shuffle=True, - stratify=stratifier, - random_state=seed, - ) - - yield indices_train, indices_test - else: - if self._seeds is None: - if self.estimator_.random_state is None: - self._seeds = rng.integers(low=0, high=np.iinfo(np.int32).max) - else: - self._seeds = self.estimator_.random_state - - indices_train, indices_test = train_test_split( - indices, - shuffle=True, - test_size=self.test_size, - stratify=stratifier, - random_state=self._seeds, - ) - - for _ in range(self.estimator_.n_estimators): - yield indices_train, indices_test - - @property - def train_test_samples_(self): - """ - The subset of drawn samples for each base estimator. - - Returns a dynamically generated list of indices identifying - the samples used for fitting each member of the ensemble, i.e., - the in-bag samples. - - Note: the list is re-created at each call to the property in order - to reduce the object memory footprint by not storing the sampling - data. Thus fetching the property may be slower than expected. 
- """ - if self._n_samples_ is None: - raise RuntimeError("The estimator must be fitted before accessing this attribute.") - - # we are not train/test splitting, then - if not self.train_test_split: - return [ - (np.arange(self._n_samples_, dtype=int), np.array([], dtype=int)) - for _ in range(len(self.estimator_.estimators_)) - ] - - # Stratifier uses a cached _y attribute if available - stratifier = self._y if is_classifier(self.estimator_) and self.stratify else None - - return [ - (indices_train, indices_test) - for indices_train, indices_test in self._get_estimators_indices(stratifier=stratifier) - ] - - def _statistic( - self, - estimator: ForestClassifier, - X: ArrayLike, - y: ArrayLike, - covariate_index: ArrayLike, - metric: str, - return_posteriors: bool, - **metric_kwargs, - ): - raise NotImplementedError("Subclasses should implement this!") - - def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: Optional[ArrayLike] = None): - X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True, dtype=DTYPE) - if y.ndim != 2: - y = y.reshape(-1, 1) - - if covariate_index is not None: - if not isinstance(covariate_index, (list, tuple, np.ndarray)): - raise RuntimeError("covariate_index must be an iterable of integer indices") - else: - if not all(isinstance(idx, (np.integer, int)) for idx in covariate_index): - raise RuntimeError("Not all covariate_index are integer indices") - - if self.test_size * X.shape[0] < 5: - raise RuntimeError( - f"There are less than 5 testing samples used with " - f"test_size={self.test_size} for X ({X.shape})." - ) - - if self._n_samples_ is not None and X.shape[0] != self._n_samples_: - raise RuntimeError( - f"X must have {self._n_samples_} samples, got {X.shape[0]}. " - f"If running on a new dataset, call the 'reset' method." - ) - if self.n_features_in_ is not None and X.shape[1] != self.n_features_in_: - raise RuntimeError( - f"X must have {self.n_features_in_} features, got {X.shape[1]}. " - f"If running on a new dataset, call the 'reset' method." - ) - if self._type_of_target_ is not None and type_of_target(y) != self._type_of_target_: - raise RuntimeError( - f"y must have type {self._type_of_target_}, got {type_of_target(y)}. " - f"If running on a new dataset, call the 'reset' method." - ) - - if not self.train_test_split and not isinstance(self.estimator, HonestForestClassifier): - raise RuntimeError("Train test split must occur if not using honest forest classifier.") - - if self.permute_forest_fraction is not None and self.permute_forest_fraction < 0.0: - raise RuntimeError("permute_forest_fraction must be non-negative.") - - if ( - self.permute_forest_fraction is not None - and self.permute_forest_fraction * self.n_estimators < 1.0 - ): - raise RuntimeError( - "permute_forest_fraction must be greater than 1./n_estimators, " - f"got {self.permute_forest_fraction}." - ) - - return X, y, covariate_index - - def statistic( - self, - X: ArrayLike, - y: ArrayLike, - covariate_index: Optional[ArrayLike] = None, - metric="mi", - return_posteriors: bool = False, - check_input: bool = True, - **metric_kwargs, - ) -> Tuple[float, ArrayLike, ArrayLike]: - """Compute the test statistic. - - Parameters - ---------- - X : ArrayLike of shape (n_samples, n_features) - The data matrix. - y : ArrayLike of shape (n_samples, n_outputs) - The target matrix. - covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, by default None. - metric : str, optional - The metric to compute, by default "mse". 
- return_posteriors : bool, optional - Whether or not to return the posteriors, by default False. - check_input : bool, optional - Whether or not to check the input, by default True. - **metric_kwargs : dict, optional - Additional keyword arguments to pass to the metric function. - - Returns - ------- - stat : float - The test statistic. - posterior_final : ArrayLike of shape (n_estimators, n_samples_final, n_outputs) or - (n_estimators, n_samples_final), optional - If ``return_posteriors`` is True, then the posterior probabilities of the - samples used in the final test. ``n_samples_final`` is equal to ``n_samples`` - if all samples are encountered in the test set of at least one tree in the - posterior computation. - samples : ArrayLike of shape (n_samples_final,), optional - The indices of the samples used in the final test. ``n_samples_final`` is - equal to ``n_samples`` if all samples are encountered in the test set of at - least one tree in the posterior computation. - """ - if check_input: - X, y, covariate_index = self._check_input(X, y, covariate_index) - - if self._n_samples_ is None: - self._n_samples_, self.n_features_in_ = X.shape - - # Infer type of target y - if self._type_of_target_ is None: - self._type_of_target_ = type_of_target(y) - - if covariate_index is None: - self.estimator_ = self._get_estimator() - estimator = self.estimator_ - else: - self.permuted_estimator_ = self._get_estimator() - estimator = self.permuted_estimator_ - - if not hasattr(self, "estimator_"): - self.estimator_ = self._get_estimator() - - # XXX: this can be improved as an extra fit can be avoided, by - # just doing error-checking - # and then setting the internal meta data structures - # first run a dummy fit on the samples to initialize the - # internal data structure of the forest - if is_classifier(self.estimator_): - _unique_y = [] - for axis in range(y.shape[1]): - _unique_y.append(np.unique(y[:, axis])) - unique_y = np.hstack(_unique_y) - if unique_y.ndim > 1 and unique_y.shape[1] == 1: - unique_y = unique_y.ravel() - X_dummy = np.zeros((unique_y.shape[0], X.shape[1])) - self.estimator_.fit(X_dummy, unique_y) - else: - if y.ndim > 1 and y.shape[1] == 1: - self.estimator_.fit(X[:2], y[:2].ravel()) - else: - self.estimator_.fit(X[:2], y[:2]) - - # Store a cache of the y variable - if is_classifier(estimator): - self._y = y.copy() - - # # XXX: this can be improved as an extra fit can be avoided, by just doing error-checking - # # and then setting the internal meta data structures - # # first run a dummy fit on the samples to initialize the - # # internal data structure of the forest - if not _is_fitted(estimator) and is_classifier(estimator): - _unique_y = [] - for axis in range(y.shape[1]): - _unique_y.append(np.unique(y[:, axis])) - unique_y = np.hstack(_unique_y) - if unique_y.ndim > 1 and unique_y.shape[1] == 1: - unique_y = unique_y.ravel() - X_dummy = np.zeros((unique_y.shape[0], X.shape[1])) - estimator.fit(X_dummy, unique_y) - elif not _is_fitted(estimator): - if y.ndim > 1 and y.shape[1] == 1: - estimator.fit(X[:2], y[:2].ravel()) - else: - estimator.fit(X[:2], y[:2]) - - # sampling a separate train/test per tree - if self.sample_dataset_per_tree: - self.n_samples_test_ = self._n_samples_ - else: - # here we fix a training/testing dataset - test_size_ = int(self.test_size * self._n_samples_) - - # Fit each tree and compute posteriors with train test splits - self.n_samples_test_ = test_size_ - - if self._metric is not None and self._metric != metric: - raise RuntimeError( - f"Metric 
must be {self._metric}, got {metric}. " - f"If running on a new dataset, call the 'reset' method." - ) - self._metric = metric - - if not is_classifier(estimator) and metric not in REGRESSOR_METRICS: - raise RuntimeError( - f'Metric must be either "mse" or "mae" if using Regression, got {metric}' - ) - - if estimator.n_outputs_ > 1 and metric == "auc": - raise ValueError("AUC metric is not supported for multi-output") - - return self._statistic( - estimator, - X, - y, - covariate_index=covariate_index, - metric=metric, - return_posteriors=return_posteriors, - **metric_kwargs, - ) - - def test( - self, - X, - y, - covariate_index: Optional[ArrayLike] = None, - metric: str = "mi", - n_repeats: int = 1000, - return_posteriors: bool = True, - **metric_kwargs, - ): - """Perform hypothesis test using Coleman method. - - X is split into a training/testing split. Optionally, the covariate index - columns are shuffled. - - On the training dataset, two honest forests are trained and then the posterior - is estimated on the testing dataset. One honest forest is trained on the - permuted dataset and the other is trained on the original dataset. - - Finally, resample the posteriors of the two forests to compute the null - distribution of the statistics. - - Parameters - ---------- - X : ArrayLike of shape (n_samples, n_features) - The data matrix. - y : ArrayLike of shape (n_samples, n_outputs) - The target matrix. - covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, will shuffle all columns by - default (corresponding to None). - metric : str, optional - The metric to compute, by default "mse". - n_repeats : int, optional - Number of times to sample the null distribution, by default 1000. - return_posteriors : bool, optional - Whether or not to return the posteriors, by default True. - **metric_kwargs : dict, optional - Additional keyword arguments to pass to the metric function. - - Returns - ------- - stat : float - The test statistic. To compute the test statistic, take - ``permute_stat_`` and subtract ``observe_stat_``. - pval : float - The p-value of the test statistic. 
- """ - X, y, covariate_index = self._check_input(X, y, covariate_index) - - if not self._is_fitted: - # first compute the test statistic on the un-permuted data - observe_stat, observe_posteriors, observe_samples = self.statistic( - X, - y, - covariate_index=None, - metric=metric, - return_posteriors=return_posteriors, - check_input=False, - **metric_kwargs, - ) - else: - observe_samples = self.observe_samples_ - observe_posteriors = self.observe_posteriors_ - observe_stat = self.observe_stat_ - - if covariate_index is None: - covariate_index = np.arange(X.shape[1], dtype=int) - - # next permute the data - permute_stat, permute_posteriors, permute_samples = self.statistic( - X, - y, - covariate_index=covariate_index, - metric=metric, - return_posteriors=return_posteriors, - check_input=False, - **metric_kwargs, - ) - self.permute_stat_ = permute_stat - - # Note: at this point, both `estimator` and `permuted_estimator_` should - # have been fitted already, so we can now compute on the null by resampling - # the posteriors and computing the test statistic on the resampled posteriors - if self.sample_dataset_per_tree: - metric_star, metric_star_pi = _compute_null_distribution_coleman( - y_test=y, - y_pred_proba_normal=observe_posteriors, - y_pred_proba_perm=permute_posteriors, - metric=metric, - n_repeats=n_repeats, - seed=self.random_state, - **metric_kwargs, - ) - else: - # If not sampling a new dataset per tree, then we may either be - # permuting the covariate index per tree or per forest. If not permuting - # there is only one train and test split, so we can just use that - _, indices_test = self.train_test_samples_[0] - indices_test = observe_samples - y_test = y[indices_test, :] - y_pred_proba_normal = observe_posteriors[:, indices_test, :] - y_pred_proba_perm = permute_posteriors[:, indices_test, :] - - metric_star, metric_star_pi = _compute_null_distribution_coleman( - y_test=y_test, - y_pred_proba_normal=y_pred_proba_normal, - y_pred_proba_perm=y_pred_proba_perm, - metric=metric, - n_repeats=n_repeats, - seed=self.random_state, - **metric_kwargs, - ) - # metric^\pi - metric = observed test statistic, which under the - # null is normally distributed around 0 - observe_test_stat = permute_stat - observe_stat - - # metric^\pi_j - metric_j, which is centered at 0 - null_dist = metric_star_pi - metric_star - - # compute pvalue - if metric in POSITIVE_METRICS: - pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats) - else: - pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats) - - if return_posteriors: - self.observe_posteriors_ = observe_posteriors - self.permute_posteriors_ = permute_posteriors - self.observe_samples_ = observe_samples - self.permute_samples_ = permute_samples - - self.null_dist_ = null_dist - return observe_test_stat, pvalue - - -class FeatureImportanceForestRegressor(BaseForestHT): - """Forest hypothesis testing with continuous `y` variable. - - Implements the algorithm described in :footcite:`coleman2022scalable`. - - The dataset is split into a training and testing dataset initially. Then there - are two forests that are trained: one on the original dataset, and one on the - permuted dataset. The dataset is either permuted once, or independently for - each tree in the permuted forest. The original test statistic is computed by - comparing the metric on both forests ``(metric_forest - metric_perm_forest)``. - - Then the output predictions are randomly sampled to recompute the test statistic - ``n_repeats`` times. 
The p-value is computed as the proportion of times the - null test statistic is greater than the original test statistic. - - Parameters - ---------- - estimator : object, default=None - Type of forest estimator to use. By default `None`, which defaults to - :class:`sklearn.ensemble.RandomForestRegressor`. - - random_state : int, RandomState instance or None, default=None - Controls both the randomness of the bootstrapping of the samples used - when building trees (if ``bootstrap=True``) and the sampling of the - features to consider when looking for the best split at each node - (if ``max_features < n_features``). - See :term:`Glossary ` for details. - - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - - test_size : float, default=0.2 - Proportion of samples per tree to use for the test set. - - sample_dataset_per_tree : bool, default=False - Whether to sample the dataset per tree or per forest. - - conditional_perm : bool, default=False - Whether or not to conditionally permute the covariate index. If True, - then the covariate index is permuted while preserving the joint with respect - to the rest of the covariates. - - permute_forest_fraction : float, default=None - The fraction of trees to permute the covariate index for. If None, then - just one permutation is performed. If sampling a permutation per tree - is desirable, then the fraction should be set to ``1. / n_estimators``. - - train_test_split : bool, default=True - Whether to split the dataset before passing to the forest. - - Attributes - ---------- - estimator_ : BaseForest - The estimator used to compute the test statistic. - - n_samples_test_ : int - The number of samples used in the final test set. - - indices_train_ : ArrayLike of shape (n_samples_train,) - The indices of the samples used in the training set. - - indices_test_ : ArrayLike of shape (n_samples_test,) - The indices of the samples used in the testing set. - - samples_ : ArrayLike of shape (n_samples_final,) - The indices of the samples used in the final test set that would slice - the original ``(X, y)`` input. - - y_true_final_ : ArrayLike of shape (n_samples_final,) - The true labels of the samples used in the final test. - - observe_posteriors_ : ArrayLike of shape (n_estimators, n_samples, n_outputs) or - (n_estimators, n_samples, n_classes) - The predicted posterior probabilities of the samples used in the final test. - For samples that are NaNs for all estimators, means the sample was not used - in the test set at all across all trees. - - null_dist_ : ArrayLike of shape (n_repeats,) - The null distribution of the test statistic. - - Notes - ----- - This class trains two forests: one on the original dataset, and one on the - permuted dataset. The forest from the original dataset is cached and re-used to - compute the test-statistic each time the :meth:`test` method is called. However, - the forest from the permuted dataset is re-trained each time the :meth:`test` is called - if the ``covariate_index`` differs from the previous run. - - To fully start from a new dataset, call the ``reset`` method, which will then - re-train both forests upon calling the :meth:`test` and :meth:`statistic` methods. - - References - ---------- - .. 
footbibliography:: - """ - - def __init__( - self, - estimator=None, - random_state=None, - verbose=0, - test_size=0.2, - sample_dataset_per_tree=False, - conditional_perm=False, - permute_forest_fraction=None, - train_test_split=True, - ): - super().__init__( - estimator=estimator, - random_state=random_state, - verbose=verbose, - test_size=test_size, - sample_dataset_per_tree=sample_dataset_per_tree, - conditional_perm=conditional_perm, - permute_forest_fraction=permute_forest_fraction, - train_test_split=train_test_split, - ) - - def _get_estimator(self): - if self.estimator is None: - estimator_ = RandomForestRegressor() - elif not isinstance(self.estimator, (ForestRegressor, sklearnForestRegressor)): - raise RuntimeError(f"Estimator must be a ForestRegressor, got {type(self.estimator)}") - else: - estimator_ = self.estimator - return clone(estimator_) - - def statistic( - self, - X: ArrayLike, - y: ArrayLike, - covariate_index: Optional[ArrayLike] = None, - metric="mse", - return_posteriors: bool = False, - check_input: bool = True, - **metric_kwargs, - ): - """Compute the test statistic. - - Parameters - ---------- - X : ArrayLike of shape (n_samples, n_features) - The data matrix. - y : ArrayLike of shape (n_samples, n_outputs) - The target matrix. - covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, by default None. - metric : str, optional - The metric to compute, by default "mse". - return_posteriors : bool, optional - Whether or not to return the posteriors, by default False. - check_input : bool, optional - Whether or not to check the input, by default True. - **metric_kwargs : dict, optional - Additional keyword arguments to pass to the metric function. - - Returns - ------- - stat : float - The test statistic. - posterior_final : ArrayLike of shape (n_estimators, n_samples_final, n_outputs) or - (n_estimators, n_samples_final), optional - If ``return_posteriors`` is True, then the posterior probabilities of the - samples used in the final test. ``n_samples_final`` is equal to ``n_samples`` - if all samples are encountered in the test set of at least one tree in the - posterior computation. - samples : ArrayLike of shape (n_samples_final,), optional - The indices of the samples used in the final test. ``n_samples_final`` is - equal to ``n_samples`` if all samples are encountered in the test set of at - least one tree in the posterior computation. 
- """ - return super().statistic( - X, y, covariate_index, metric, return_posteriors, check_input, **metric_kwargs - ) - - def _statistic( - self, - estimator: ForestClassifier, - X: ArrayLike, - y: ArrayLike, - covariate_index: ArrayLike, - metric: str, - return_posteriors: bool, - **metric_kwargs, - ): - """Helper function to compute the test statistic.""" - metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric] - rng = np.random.default_rng(self.random_state) - - posterior_arr = np.full((self.n_estimators, self._n_samples_, estimator.n_outputs_), np.nan) - - # both sampling dataset per tree or permuting per tree requires us to bypass the - # sklearn API to fit each tree individually - if self.sample_dataset_per_tree or self.permute_forest_fraction: - if self.permute_forest_fraction and covariate_index is not None: - random_states = [tree.random_state for tree in estimator.estimators_] - else: - random_states = [estimator.random_state] * len(estimator.estimators_) - - trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")( - delayed(_parallel_build_trees_with_sepdata)( - estimator.estimators_[idx], - len(estimator.estimators_), - idx, - indices_train, - X, - y, - covariate_index, - bootstrap=estimator.bootstrap, - max_samples=estimator.max_samples, - random_state=random_states[idx], - ) - for idx, (indices_train, _) in enumerate(self.train_test_samples_) - ) - estimator.estimators_ = trees - else: - # fitting a forest will only get one unique train/test split - indices_train, indices_test = self.train_test_samples_[0] - - X_train, _ = X[indices_train, :], X[indices_test, :] - y_train, _ = y[indices_train, :], y[indices_test, :] - - if covariate_index is not None: - # perform permutation of covariates - if self.conditional_perm: - X_perm_cov_ind = conditional_resample( - X_train, X_train[:, covariate_index], replace=False, random_state=rng - ) - X_train[:, covariate_index] = X_perm_cov_ind - else: - n_samples_train = X_train.shape[0] - index_arr = rng.choice( - np.arange(n_samples_train, dtype=int), - size=(n_samples_train, 1), - replace=False, - shuffle=True, - ) - X_train[:, covariate_index] = X_train[index_arr, covariate_index] - - if self._type_of_target_ == "binary": - y_train = y_train.ravel() - estimator.fit(X_train, y_train) - - # TODO: probably a more elegant way of doing this - if self.train_test_split: - # accumulate the predictions across all trees - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)( - estimator.estimators_[idx].predict, X, indices_test - ) - for idx, (_, indices_test) in enumerate(self.train_test_samples_) - ) - for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)): - _, indices_test = est_indices - posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_) - else: - all_indices = np.arange(self._n_samples_, dtype=int) - - # accumulate the predictions across all trees - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, all_indices) - for idx in range(len(estimator.estimators_)) - ) - for itree, proba in enumerate(all_proba): - posterior_arr[itree, ...] 
= proba.reshape(-1, estimator.n_outputs_) - - # determine if there are any nans in the final posterior array, when - # averaged over the trees - samples = _non_nan_samples(posterior_arr) - - # Ignore all NaN values (samples not tested) - y_true_final = y[(samples), :] - - # Average all posteriors (n_samples_test, n_outputs) to compute the statistic - posterior_forest = np.nanmean(posterior_arr[:, (samples), :], axis=0) - stat = metric_func(y_true_final, posterior_forest, **metric_kwargs) - if covariate_index is None: - # Ignore all NaN values (samples not tested) -> (n_samples_final, n_outputs) - # arrays of y and predicted posterior - self.observe_samples_ = samples - self.y_true_final_ = y_true_final - self.observe_posteriors_ = posterior_arr - self.observe_stat_ = stat - self._is_fitted = True - - if return_posteriors: - return stat, posterior_arr, samples - - return stat - - -class FeatureImportanceForestClassifier(BaseForestHT): - """Forest hypothesis testing with categorical `y` variable. - - Implements the algorithm described in :footcite:`coleman2022scalable`. - - The dataset is split into a training and testing dataset initially. Then there - are two forests that are trained: one on the original dataset, and one on the - permuted dataset. The dataset is either permuted once, or independently for - each tree in the permuted forest. The original test statistic is computed by - comparing the metric on both forests ``(metric_forest - metric_perm_forest)``. - - Then the output predictions are randomly sampled to recompute the test statistic - ``n_repeats`` times. The p-value is computed as the proportion of times the - null test statistic is greater than the original test statistic. - - Parameters - ---------- - estimator : object, default=None - Type of forest estimator to use. By default `None`, which defaults to - :class:`sklearn.ensemble.RandomForestRegressor`. - - random_state : int, RandomState instance or None, default=None - Controls both the randomness of the bootstrapping of the samples used - when building trees (if ``bootstrap=True``) and the sampling of the - features to consider when looking for the best split at each node - (if ``max_features < n_features``). - See :term:`Glossary ` for details. - - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - - test_size : float, default=0.2 - Proportion of samples per tree to use for the test set. - - stratify : bool, default=True - Whether to stratify the samples by class labels. - - conditional_perm : bool, default=False - Whether or not to conditionally permute the covariate index. If True, - then the covariate index is permuted while preserving the joint with respect - to the rest of the covariates. - - sample_dataset_per_tree : bool, default=False - Whether to sample the dataset per tree or per forest. - - permute_forest_fraction : float, default=None - The fraction of trees to permute the covariate index for. If None, then - just one permutation is performed. - - train_test_split : bool, default=True - Whether to split the data into train/test before passing to the forest. - - Attributes - ---------- - estimator_ : BaseForest - The estimator used to compute the test statistic. - - n_samples_test_ : int - The number of samples used in the final test set. - - indices_train_ : ArrayLike of shape (n_samples_train,) - The indices of the samples used in the training set. - - indices_test_ : ArrayLike of shape (n_samples_test,) - The indices of the samples used in the testing set. 
- - samples_ : ArrayLike of shape (n_samples_final,) - The indices of the samples used in the final test set that would slice - the original ``(X, y)`` input along the rows. - - y_true_final_ : ArrayLike of shape (n_samples_final,) - The true labels of the samples used in the final test. - - observe_posteriors_ : ArrayLike of shape (n_estimators, n_samples_final, n_outputs) or - (n_estimators, n_samples_final, n_classes) - The predicted posterior probabilities of the samples used in the final test. - - null_dist_ : ArrayLike of shape (n_repeats,) - The null distribution of the test statistic. - - Notes - ----- - This class trains two forests: one on the original dataset, and one on the - permuted dataset. The forest from the original dataset is cached and re-used to - compute the test-statistic each time the :meth:`test` method is called. However, - the forest from the permuted dataset is re-trained each time the :meth:`test` is called - if the ``covariate_index`` differs from the previous run. - - To fully start from a new dataset, call the ``reset`` method, which will then - re-train both forests upon calling the :meth:`test` and :meth:`statistic` methods. - - References - ---------- - .. footbibliography:: - """ - - def __init__( - self, - estimator=None, - random_state=None, - verbose=0, - test_size=0.2, - stratify=True, - conditional_perm=False, - sample_dataset_per_tree=False, - permute_forest_fraction=None, - train_test_split=True, - ): - super().__init__( - estimator=estimator, - random_state=random_state, - verbose=verbose, - test_size=test_size, - sample_dataset_per_tree=sample_dataset_per_tree, - stratify=stratify, - conditional_perm=conditional_perm, - train_test_split=train_test_split, - permute_forest_fraction=permute_forest_fraction, - ) - - def _get_estimator(self): - if self.estimator is None: - estimator_ = RandomForestClassifier() - elif not isinstance(self.estimator, (ForestClassifier, sklearnForestClassifier)): - raise RuntimeError(f"Estimator must be a ForestClassifier, got {type(self.estimator)}") - else: - # self.estimator is an instance of a ForestEstimator - estimator_ = self.estimator - return clone(estimator_) - - def _statistic( - self, - estimator: ForestClassifier, - X: ArrayLike, - y: ArrayLike, - covariate_index: ArrayLike, - metric: str, - return_posteriors: bool, - **metric_kwargs, - ): - """Helper function to compute the test statistic.""" - metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric] - rng = np.random.default_rng(estimator.random_state) - - if metric in POSTERIOR_FUNCTIONS: - predict_posteriors = True - else: - predict_posteriors = False - - if predict_posteriors: - # now initialize posterior array as (n_trees, n_samples_test, n_classes) - # XXX: currently assumes n_outputs_ == 1 - posterior_arr = np.full( - (self.n_estimators, self._n_samples_, estimator.n_classes_), np.nan - ) - else: - # now initialize posterior array as (n_trees, n_samples_test, n_outputs) - posterior_arr = np.full( - (self.n_estimators, self._n_samples_, estimator.n_outputs_), np.nan - ) - - # both sampling dataset per tree or permuting per tree requires us to bypass the - # sklearn API to fit each tree individually - if self.sample_dataset_per_tree or self.permute_forest_fraction: - if self.permute_forest_fraction and covariate_index is not None: - random_states = [tree.random_state for tree in estimator.estimators_] - else: - random_states = [estimator.random_state] * len(estimator.estimators_) - - trees = Parallel(n_jobs=estimator.n_jobs, 
verbose=self.verbose, prefer="threads")( - delayed(_parallel_build_trees_with_sepdata)( - estimator.estimators_[idx], - len(estimator.estimators_), - idx, - indices_train, - X, - y, - covariate_index, - bootstrap=estimator.bootstrap, - max_samples=estimator.max_samples, - random_state=random_states[idx], - ) - for idx, (indices_train, _) in enumerate(self.train_test_samples_) - ) - estimator.estimators_ = trees - else: - # fitting a forest will only get one unique train/test split - indices_train, indices_test = self.train_test_samples_[0] - - X_train, _ = X[indices_train, :], X[indices_test, :] - y_train = y[indices_train, :] - - if covariate_index is not None: - # perform permutation of covariates - if self.conditional_perm: - X_perm_cov_ind = conditional_resample( - X_train, X_train[:, covariate_index], replace=False, random_state=rng - ) - X_train[:, covariate_index] = X_perm_cov_ind - else: - n_samples_train = X_train.shape[0] - index_arr = rng.choice( - np.arange(n_samples_train, dtype=int), - size=(n_samples_train, 1), - replace=False, - shuffle=True, - ) - X_train[:, covariate_index] = X_train[index_arr, covariate_index] - - if self._type_of_target_ == "binary" or (y.ndim > 1 and y.shape[1] == 1): - y_train = y_train.ravel() - estimator.fit(X_train, y_train) - - # list of tree outputs. Each tree output is (n_samples, n_outputs), or (n_samples,) - if predict_posteriors: - # all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - # delayed(_parallel_predict_proba)( - # estimator.estimators_[idx].predict_proba, X, indices_test - # ) - # for idx, (_, indices_test) in enumerate(self.train_test_samples_) - # ) - - # TODO: probably a more elegant way of doing this - if self.train_test_split: - # accumulate the predictions across all trees - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)( - estimator.estimators_[idx].predict_proba, X, indices_test - ) - for idx, (_, indices_test) in enumerate(self.train_test_samples_) - ) - else: - all_indices = np.arange(self._n_samples_, dtype=int) - - # accumulate the predictions across all trees - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)( - estimator.estimators_[idx].predict_proba, X, all_indices - ) - for idx in range(len(estimator.estimators_)) - ) - else: - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)( - estimator.estimators_[idx].predict, X, indices_test - ) - for idx, (_, indices_test) in enumerate(self.train_test_samples_) - ) - for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)): - _, indices_test = est_indices - - if predict_posteriors: - if self.train_test_split: - posterior_arr[itree, indices_test, ...] = proba.reshape( - -1, estimator.n_classes_ - ) - else: - posterior_arr[itree, ...] = proba.reshape(-1, estimator.n_classes_) - else: - posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_) - - if metric == "auc": - # at this point, posterior_final is the predicted posterior for only the positive class - # as more than one output is not supported. - if self._type_of_target_ == "binary": - posterior_arr = posterior_arr[..., (1,)] - else: - raise RuntimeError( - f"AUC metric is not supported for {self._type_of_target_} targets." 
- ) - - # determine if there are any nans in the final posterior array, when - # averaged over the trees - samples = _non_nan_samples(posterior_arr) - - # Ignore all NaN values (samples not tested) - y_true_final = y[(samples), :] - - # Average all posteriors (n_samples_test, n_outputs) to compute the statistic - posterior_forest = np.nanmean(posterior_arr[:, (samples), :], axis=0) - stat = metric_func(y_true_final, posterior_forest, **metric_kwargs) - - if covariate_index is None: - # Ignore all NaN values (samples not tested) -> (n_samples_final, n_outputs) - # arrays of y and predicted posterior - self.observe_samples_ = samples - self.y_true_final_ = y_true_final - self.observe_posteriors_ = posterior_arr - self.observe_stat_ = stat - self._is_fitted = True - - if return_posteriors: - return stat, posterior_arr, samples - - return stat - - def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock): """ This is a utility function for joblib's Parallel. @@ -1253,7 +83,8 @@ def build_coleman_forest( y : ArrayLike of shape (n_samples, n_outputs) Binary target, so ``n_outputs`` should be at most 1. covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, by default None. + The index array of covariates to shuffle, by default None, which + defaults to all covariates. metric : str, optional The metric to compute, by default "s@98", for sensitivity at 98% specificity. @@ -1300,7 +131,9 @@ def build_coleman_forest( raise RuntimeError( f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}" ) - perm_est, perm_forest_proba = build_oob_forest(perm_est, X, y, verbose=verbose) + perm_est, perm_forest_proba = build_oob_forest( + perm_est, X, y, verbose=verbose, covariate_index=covariate_index + ) # get the number of jobs n_jobs = est.n_jobs diff --git a/sktree/stats/meson.build b/sktree/stats/meson.build index b99353c80..cd5472386 100644 --- a/sktree/stats/meson.build +++ b/sktree/stats/meson.build @@ -2,7 +2,6 @@ python_sources = [ '__init__.py', 'forestht.py', 'utils.py', - 'permutationforest.py', 'monte_carlo.py', 'permuteforest.py', ] diff --git a/sktree/stats/permutationforest.py b/sktree/stats/permutationforest.py deleted file mode 100644 index 075874b12..000000000 --- a/sktree/stats/permutationforest.py +++ /dev/null @@ -1,439 +0,0 @@ -from typing import Optional - -import numpy as np -from numpy.typing import ArrayLike -from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.ensemble._forest import ForestClassifier as sklearnForestClassifier -from sklearn.ensemble._forest import ForestRegressor as sklearnForestRegressor -from sklearn.model_selection import train_test_split -from sklearn.utils.validation import check_X_y - -from sktree._lib.sklearn.ensemble._forest import ForestClassifier, ForestRegressor - -from .utils import METRIC_FUNCTIONS, REGRESSOR_METRICS, _compute_null_distribution_perm - - -class BasePermutationForest(MetaEstimatorMixin): - def __init__( - self, - estimator=None, - test_size=0.2, - random_state=None, - verbose=0, - ): - self.estimator = estimator - self.test_size = test_size - self.random_state = random_state - self.verbose = verbose - - def reset(self): - class_attributes = dir(type(self)) - instance_attributes = dir(self) - - for attr_name in instance_attributes: - if attr_name.endswith("_") and attr_name not in class_attributes: - delattr(self, attr_name) 
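The ``BasePermutationForest`` machinery being deleted here implements a brute-force permutation test: the columns named by ``covariate_index`` are shuffled, a fresh forest is refit on the shuffled data, and the metric is recomputed once per repeat (see the ``_compute_null_distribution_perm`` call and the cost warning in the class docstrings further down in this hunk). A minimal sketch of that idea with a plain scikit-learn regressor; the data, sizes, and estimator settings are illustrative and not taken from this patch:

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split

    rng = np.random.default_rng(0)
    X = rng.standard_normal((200, 5))
    y = X[:, 0] + 0.1 * rng.standard_normal(200)
    covariate_index = [0]  # columns whose importance is under test
    n_repeats = 100

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    def refit_and_score(X_tr):
        # Refit a forest from scratch and score it on the held-out split.
        est = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_tr, y_train)
        return mean_squared_error(y_test, est.predict(X_test))

    observe_stat = refit_and_score(X_train)

    # Each repeat shuffles the tested columns jointly and refits the whole forest,
    # which is why this style of test is expensive compared to resampling posteriors.
    null_dist = []
    for _ in range(n_repeats):
        X_perm = X_train.copy()
        X_perm[:, covariate_index] = rng.permutation(X_perm[:, covariate_index], axis=0)
        null_dist.append(refit_and_score(X_perm))

    # Matching the removed code's direction for MSE (lower is better): count the
    # permuted refits whose error is lower than the un-permuted fit's error.
    pvalue = (1 + np.sum(np.array(null_dist) < observe_stat)) / (1 + n_repeats)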
- - def _get_estimator(self): - pass - - @property - def train_test_samples_(self): - """ - The subset of drawn samples for each base estimator. - - Returns a dynamically generated list of indices identifying - the samples used for fitting each member of the ensemble, i.e., - the in-bag samples. - - Note: the list is re-created at each call to the property in order - to reduce the object memory footprint by not storing the sampling - data. Thus fetching the property may be slower than expected. - """ - indices = np.arange(self._n_samples_, dtype=int) - - # Get drawn indices along both sample and feature axes - indices_train, indices_test = train_test_split( - indices, test_size=self.test_size, shuffle=True, random_state=self.random_state - ) - return indices_train, indices_test - - def _statistic( - self, - estimator: BaseEstimator, - X: ArrayLike, - y: ArrayLike, - covariate_index: Optional[ArrayLike] = None, - metric="mse", - return_posteriors: bool = False, - seed=None, - **metric_kwargs, - ): - """Helper function to compute the test statistic.""" - metric_func = METRIC_FUNCTIONS[metric] - if seed is None: - rng = np.random.default_rng(self.random_state) - else: - rng = np.random.default_rng(seed) - indices_train, indices_test = self.train_test_samples_ - if covariate_index is not None: - n_samples = X.shape[0] - indices = np.arange(n_samples, dtype=int) - # perform permutation of covariates - index_arr = rng.choice(indices, size=(n_samples, 1), replace=False, shuffle=False) - X = X.copy() - X[:, covariate_index] = X[index_arr, covariate_index] - - X_train, X_test = X[indices_train, :], X[indices_test, :] - y_train, y_test = y[indices_train, :], y[indices_test, :] - if y_train.shape[1] == 1: - y_train = y_train.ravel() - y_test = y_test.ravel() - estimator.fit(X_train, y_train) - - # Either get the predicted value, or the posterior probabilities - y_pred = estimator.predict(X_test) - - # set variables to compute metric - samples = indices_test - y_true_final = y_test - posterior_final = y_pred - - stat = metric_func(y_true_final, posterior_final, **metric_kwargs) - - if covariate_index is None: - # Ignore all NaN values (samples not tested) -> (n_samples_final, n_outputs) - # arrays of y and predicted posterior - self.samples_ = samples - self.y_true_ = y_true_final - self.posterior_final_ = posterior_final - self.stat_ = stat - - if return_posteriors: - return stat, posterior_final, samples - - return stat - - def statistic( - self, - X: ArrayLike, - y: ArrayLike, - covariate_index: Optional[ArrayLike] = None, - metric="mse", - return_posteriors: bool = False, - check_input: bool = True, - seed=None, - **metric_kwargs, - ): - """Compute the test statistic. - - Parameters - ---------- - X : ArrayLike of shape (n_samples, n_features) - The data matrix. - y : ArrayLike of shape (n_samples, n_outputs) - The target matrix. - covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, by default None. - metric : str, optional - The metric to compute, by default "mse". - return_posteriors : bool, optional - Whether or not to return the posteriors, by default False. - check_input : bool, optional - Whether or not to check the input, by default True. - seed : int, optional - The random seed to use, by default None. - **metric_kwargs : dict, optional - Keyword arguments to pass to the metric function. - - Returns - ------- - stat : float - The test statistic. 
- posterior_final : ArrayLike of shape (n_samples_final, n_outputs), optional - If ``return_posteriors`` is True, then the posterior probabilities of the - samples used in the final test. ``n_samples_final`` is equal to ``n_samples`` - if all samples are encountered in the test set of at least one tree in the - posterior computation. - samples : ArrayLike of shape (n_samples_final,), optional - The indices of the samples used in the final test. ``n_samples_final`` is - equal to ``n_samples`` if all samples are encountered in the test set of at - least one tree in the posterior computation. - """ - if check_input: - X, y = check_X_y(X, y, ensure_2d=True, multi_output=True) - if y.ndim != 2: - y = y.reshape(-1, 1) - - self._n_samples_ = X.shape[0] - self.estimator_ = self._get_estimator() - - if is_classifier(self.estimator_): - if metric not in METRIC_FUNCTIONS: - raise RuntimeError( - f"Metric must be one of {list(METRIC_FUNCTIONS.keys())}, got {metric}" - ) - else: - if metric not in REGRESSOR_METRICS: - raise RuntimeError(f'Metric must be either "mse" or "mae", got {metric}') - - if covariate_index is None: - estimator = self.estimator_ - else: - self.permuted_estimator_ = clone(self.estimator_) - estimator = self.permuted_estimator_ - - return self._statistic( - estimator, - X, - y, - covariate_index=covariate_index, - metric=metric, - return_posteriors=return_posteriors, - seed=seed, - **metric_kwargs, - ) - - def test( - self, - X: ArrayLike, - y: ArrayLike, - covariate_index: ArrayLike, - metric: str = "mse", - n_repeats: int = 1000, - return_posteriors: bool = False, - **metric_kwargs, - ): - """Perform hypothesis test using permutation testing. - - Parameters - ---------- - X : ArrayLike of shape (n_samples, n_features) - The data matrix. - y : ArrayLike of shape (n_samples, n_outputs) - The target matrix. - covariate_index : ArrayLike of shape (n_covariates,) - The covariate indices of ``X`` to shuffle. - metric : str, optional - Metric to compute, by default "mse". - n_repeats : int, optional - Number of times to sample the null distribution, by default 1000. - return_posteriors : bool, optional - Whether or not to return the posteriors, by default False. - **metric_kwargs : dict, optional - Keyword arguments to pass to the metric function. - - Returns - ------- - observe_stat : float - Observed test statistic. - pvalue : float - Pvalue of the test. 
- """ - X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True) - if y.ndim != 2: - y = y.reshape(-1, 1) - self._n_samples_ = X.shape[0] - - # train/test split - # XXX: could add stratifying by y when y is classification - indices_train, indices_test = self.train_test_samples_ - - if not hasattr(self, "samples_"): - # first compute the test statistic on the un-permuted data - observe_stat, _, _ = self.statistic( - X, - y, - covariate_index=None, - metric=metric, - return_posteriors=True, - check_input=False, - **metric_kwargs, - ) - else: - # observe_samples = self.samples_ - # observe_posteriors = self.posterior_final_ - observe_stat = self.stat_ - - # compute null distribution of the test statistic - # WARNING: this could take a long time, since it fits a new forest - null_dist = _compute_null_distribution_perm( - X_train=X[indices_train, :], - y_train=y[indices_train, :], - X_test=X[indices_test, :], - y_test=y[indices_test, :], - covariate_index=covariate_index, - est=self.estimator_, - metric=metric, - n_repeats=n_repeats, - seed=self.random_state, - ) - - if not return_posteriors: - self.null_dist_ = np.array(null_dist) - else: - self.null_dist_ = np.array([x[0] for x in null_dist]) - self.posterior_null_ = np.array([x[1] for x in null_dist]) - - n_repeats = len(self.null_dist_) - pvalue = (1 + (self.null_dist_ < observe_stat).sum()) / (1 + n_repeats) - return observe_stat, pvalue - - -class PermutationForestRegressor(BasePermutationForest): - """Hypothesis testing of covariates with a permutation forest regressor. - - This implements permutation testing of a null hypothesis using a random forest. - The null hypothesis is generated by permuting ``n_repeats`` times the covariate - indices and then a random forest is trained for each permuted instance. This - is compared to the original random forest that was computed on the regular - non-permuted data. - - .. warning:: Permutation testing with forests is computationally expensive. - As a result, if you are testing for the importance of feature sets, consider - using `sktree.FeatureImportanceForestRegressor` or - `sktree.FeatureImportanceForestClassifier` instead, which is - much more computationally efficient. - - .. note:: This does not allow testing on the posteriors. - - Parameters - ---------- - estimator : object, default=None - Type of forest estimator to use. By default `None`, which defaults to - :class:`sklearn.ensemble.RandomForestRegressor` with default parameters. - - test_size : float, default=0.2 - The proportion of samples to leave out for each tree to compute metric on. - - random_state : int, RandomState instance or None, default=None - Controls both the randomness of the bootstrapping of the samples used - when building trees (if ``bootstrap=True``) and the sampling of the - features to consider when looking for the best split at each node - (if ``max_features < n_features``). - See :term:`Glossary ` for details. - - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - - Attributes - ---------- - samples_ : ArrayLike of shape (n_samples,) - The indices of the samples used in the final test. - - y_true_ : ArrayLike of shape (n_samples_final,) - The true labels of the samples used in the final test. - - posterior_ : ArrayLike of shape (n_samples_final, n_outputs) - The predicted posterior probabilities of the samples used in the final test. - - null_dist_ : ArrayLike of shape (n_repeats,) - The null distribution of the test statistic. 
- - posterior_null_ : ArrayLike of shape (n_samples_final, n_outputs, n_repeats) - The posterior probabilities of the samples used in the final test for each - permutation for the null distribution. - """ - - def __init__( - self, - estimator=None, - test_size=0.2, - random_state=None, - verbose=0, - ): - super().__init__( - estimator=estimator, - test_size=test_size, - random_state=random_state, - verbose=verbose, - ) - - def _get_estimator(self): - if not hasattr(self, "estimator_") and self.estimator is None: - estimator_ = RandomForestRegressor() - elif not isinstance(self.estimator, (ForestRegressor, sklearnForestRegressor)): - raise RuntimeError(f"Estimator must be a ForestRegressor, got {type(self.estimator)}") - else: - estimator_ = self.estimator - return estimator_ - - -class PermutationForestClassifier(BasePermutationForest): - """Hypothesis testing of covariates with a permutation forest classifier. - - This implements permutation testing of a null hypothesis using a random forest. - The null hypothesis is generated by permuting ``n_repeats`` times the covariate - indices and then a random forest is trained for each permuted instance. This - is compared to the original random forest that was computed on the regular - non-permuted data. - - .. warning:: Permutation testing with forests is computationally expensive. - As a result, if you are testing for the importance of feature sets, consider - using `sktree.FeatureImportanceForestRegressor` or - `sktree.FeatureImportanceForestClassifier` instead, which is - much more computationally efficient. - - .. note:: This does not allow testing on the posteriors. - - Parameters - ---------- - estimator : object, default=None - Type of forest estimator to use. By default `None`, which defaults to - :class:`sklearn.ensemble.RandomForestClassifier`. - - test_size : float, default=0.2 - The proportion of samples to leave out for each tree to compute metric on. - - random_state : int, RandomState instance or None, default=None - Controls both the randomness of the bootstrapping of the samples used - when building trees (if ``bootstrap=True``) and the sampling of the - features to consider when looking for the best split at each node - (if ``max_features < n_features``). - See :term:`Glossary ` for details. - - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - - Attributes - ---------- - samples_ : ArrayLike of shape (n_samples,) - The indices of the samples used in the final test. - - y_true_ : ArrayLike of shape (n_samples_final,) - The true labels of the samples used in the final test. - - posterior_ : ArrayLike of shape (n_samples_final, n_outputs) - The predicted posterior probabilities of the samples used in the final test. - - null_dist_ : ArrayLike of shape (n_repeats,) - The null distribution of the test statistic. - - posterior_null_ : ArrayLike of shape (n_samples_final, n_outputs, n_repeats) - The posterior probabilities of the samples used in the final test for each - permutation for the null distribution. 
- """ - - def __init__( - self, - estimator=None, - test_size=0.2, - random_state=None, - verbose=0, - ): - super().__init__( - estimator=estimator, - test_size=test_size, - random_state=random_state, - verbose=verbose, - ) - - def _get_estimator(self): - if not hasattr(self, "estimator_") and self.estimator is None: - estimator_ = RandomForestClassifier() - elif not isinstance(self.estimator, (ForestClassifier, sklearnForestClassifier)): - raise RuntimeError(f"Estimator must be a ForestClassifier, got {type(self.estimator)}") - else: - estimator_ = self.estimator - return estimator_ diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index e7b1ac2e9..53b2ae073 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -10,67 +10,61 @@ from flaky import flaky from scipy.special import expit -from sktree import RandomForestClassifier, RandomForestRegressor -from sktree.stats import ( - FeatureImportanceForestClassifier, - FeatureImportanceForestRegressor, - PermutationForestClassifier, - PermutationForestRegressor, -) - seed = 12345 rng = np.random.default_rng(seed) +# TODO: implement tests using the new MIGHT functional framework +@pytest.mark.skip() @flaky(max_runs=3) @pytest.mark.slowtest -@pytest.mark.parametrize( - "hypotester, model_kwargs, n_samples, n_repeats, test_size", - [ - [ - PermutationForestRegressor, - { - "estimator": RandomForestRegressor( - max_features="sqrt", - n_estimators=75, - n_jobs=-1, - ), - "random_state": seed, - }, - 300, - 50, - 0.1, - ], - [ - FeatureImportanceForestRegressor, - { - "estimator": RandomForestRegressor( - max_features="sqrt", - n_estimators=125, - n_jobs=-1, - ), - }, - 300, # n_samples - 1000, # n_repeats - 0.2, # test_size - ], - # XXX: Currently does not work with permute and sample dataset per tree - [ - FeatureImportanceForestRegressor, - { - "estimator": RandomForestRegressor( - max_features="sqrt", - n_estimators=250, - n_jobs=-1, - ), - "permute_forest_fraction": 0.5, - }, - 300, # n_samples - 1000, # n_repeats - 0.2, # test_size - ], - ], -) +# @pytest.mark.parametrize( +# "hypotester, model_kwargs, n_samples, n_repeats, test_size", +# [ +# [ +# PermutationForestRegressor, +# { +# "estimator": RandomForestRegressor( +# max_features="sqrt", +# n_estimators=75, +# n_jobs=-1, +# ), +# "random_state": seed, +# }, +# 300, +# 50, +# 0.1, +# ], +# [ +# FeatureImportanceForestRegressor, +# { +# "estimator": RandomForestRegressor( +# max_features="sqrt", +# n_estimators=125, +# n_jobs=-1, +# ), +# }, +# 300, # n_samples +# 1000, # n_repeats +# 0.2, # test_size +# ], +# # XXX: Currently does not work with permute and sample dataset per tree +# [ +# FeatureImportanceForestRegressor, +# { +# "estimator": RandomForestRegressor( +# max_features="sqrt", +# n_estimators=250, +# n_jobs=-1, +# ), +# "permute_forest_fraction": 0.5, +# }, +# 300, # n_samples +# 1000, # n_repeats +# 0.2, # test_size +# ], +# ], +# ) def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size): r"""Test hypothesis testing forests using MSE from linear model simulation. 
@@ -117,59 +111,60 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) assert pvalue > 0.05, f"pvalue: {pvalue}" +@pytest.mark.skip() @flaky(max_runs=3) @pytest.mark.slowtest -@pytest.mark.parametrize( - "hypotester, model_kwargs, n_samples, n_repeats, test_size", - [ - [ - PermutationForestClassifier, - { - "estimator": RandomForestClassifier( - max_features="sqrt", - n_estimators=50, - n_jobs=-1, - ), - "random_state": seed, - }, - 600, - 50, - 1.0 / 6, - ], - [ - # XXX: Currently does not work with permute and sample dataset per tree - FeatureImportanceForestClassifier, - { - "estimator": RandomForestClassifier( - max_features="sqrt", - n_estimators=100, - n_jobs=-1, - ), - "sample_dataset_per_tree": False, - }, - 600, # n_samples - 1000, # n_repeats - 1.0 / 6, # test_size - ], - # XXX: Currently does not work with permute and sample dataset per tree - [ - FeatureImportanceForestClassifier, - { - "estimator": RandomForestClassifier( - max_features="sqrt", - n_estimators=100, - n_jobs=-1, - random_state=rng.integers(0, 1000), - ), - "permute_forest_fraction": 0.5, - "random_state": rng.integers(0, 1000), - }, - 600, # n_samples - 1000, # n_repeats - 1.0 / 6, # test_size - ], - ], -) +# @pytest.mark.parametrize( +# "hypotester, model_kwargs, n_samples, n_repeats, test_size", +# [ +# [ +# PermutationForestClassifier, +# { +# "estimator": RandomForestClassifier( +# max_features="sqrt", +# n_estimators=50, +# n_jobs=-1, +# ), +# "random_state": seed, +# }, +# 600, +# 50, +# 1.0 / 6, +# ], +# [ +# # XXX: Currently does not work with permute and sample dataset per tree +# FeatureImportanceForestClassifier, +# { +# "estimator": RandomForestClassifier( +# max_features="sqrt", +# n_estimators=100, +# n_jobs=-1, +# ), +# "sample_dataset_per_tree": False, +# }, +# 600, # n_samples +# 1000, # n_repeats +# 1.0 / 6, # test_size +# ], +# # XXX: Currently does not work with permute and sample dataset per tree +# [ +# FeatureImportanceForestClassifier, +# { +# "estimator": RandomForestClassifier( +# max_features="sqrt", +# n_estimators=100, +# n_jobs=-1, +# random_state=rng.integers(0, 1000), +# ), +# "permute_forest_fraction": 0.5, +# "random_state": rng.integers(0, 1000), +# }, +# 600, # n_samples +# 1000, # n_repeats +# 1.0 / 6, # test_size +# ], +# ], +# ) def test_correlated_logit_model(hypotester, model_kwargs, n_samples, n_repeats, test_size): r"""Test MIGHT using MSE from linear model simulation. 
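The tests above are skipped pending a rewrite against the functional MIGHT framework (see the TODO note at the top of this hunk). A rough sketch of what such a rewrite could look like, following the same pattern as ``test_small_dataset_independent`` added to ``test_forestht.py`` in the next hunk: pair an ``HonestForestClassifier`` with a ``PermutationHonestForestClassifier`` and hand both to ``build_coleman_forest``. The class names, keyword arguments, and result fields below come from this patch; the simulated data, sizes, and seed are illustrative:

    import numpy as np

    from sktree import HonestForestClassifier
    from sktree.stats import PermutationHonestForestClassifier, build_coleman_forest

    seed = 12345
    rng = np.random.default_rng(seed)
    n_samples, n_features, n_estimators = 100, 20, 100

    # X carries no information about y, so the test should not reject the null.
    X = rng.standard_normal(size=(n_samples, n_features))
    y = np.vstack((np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))))

    forest_kwargs = dict(
        n_estimators=n_estimators,
        random_state=seed,
        n_jobs=-1,
        honest_fraction=0.5,
        bootstrap=True,
        max_samples=1.6,
        max_features=0.3,
        stratify=True,
    )
    clf = HonestForestClassifier(**forest_kwargs)
    perm_clf = PermutationHonestForestClassifier(**forest_kwargs)

    # covariate_index=None permutes every covariate; "s@98" is sensitivity at
    # 98% specificity, the default metric of build_coleman_forest.
    result = build_coleman_forest(
        clf, perm_clf, X, y, covariate_index=None, metric="s@98", n_repeats=1000, seed=seed
    )
    assert result.pvalue > 0.05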
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 848440620..e20de8ea8 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -1,27 +1,16 @@ -import pickle -from copy import deepcopy -from itertools import combinations -from pathlib import Path - import numpy as np import pytest from flaky import flaky -from joblib import Parallel, delayed from numpy.testing import assert_almost_equal, assert_array_equal -from scipy.stats import wilcoxon from sklearn import datasets -from sktree import HonestForestClassifier, RandomForestClassifier, RandomForestRegressor -from sktree._lib.sklearn.tree import DecisionTreeClassifier +from sktree import HonestForestClassifier from sktree.stats import ( - FeatureImportanceForestClassifier, - FeatureImportanceForestRegressor, PermutationHonestForestClassifier, build_coleman_forest, build_permutation_forest, ) -from sktree.stats.utils import _non_nan_samples -from sktree.tree import MultiViewDecisionTreeClassifier, ObliqueDecisionTreeClassifier +from sktree.tree import MultiViewDecisionTreeClassifier # load the iris dataset (n_samples, 4) # and randomly permute it @@ -38,519 +27,97 @@ iris_y = iris_y[p] -@pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) -def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): - n_samples = 50 - n_estimators = 10 - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, - random_state=seed, - ), - permute_forest_fraction=1.0 / n_estimators * 5, - test_size=0.7, - random_state=seed, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") - - assert ( - len(est.train_test_samples_[0][1]) == n_samples * est.test_size - ), f"{len(est.train_test_samples_[0][1])} {n_samples * est.test_size}" - assert len(est.train_test_samples_[0][0]) == est._n_samples_ - n_samples * est.test_size - - est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mse") - assert ( - len(est.train_test_samples_[0][1]) == n_samples * est.test_size - ), f"{len(est.train_test_samples_[0][1])} {n_samples * est.test_size}" - assert len(est.train_test_samples_[0][0]) == est._n_samples_ - n_samples * est.test_size - - # covariate index should work with mse - est.reset() - est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mse") - with pytest.raises(RuntimeError, match="Metric must be"): - est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mi") - - # covariate index should work with mse - est.reset() - est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mse") - with pytest.raises(RuntimeError, match="Metric must be"): - est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mi") - - # covariate index must be an iterable - est.reset() - with pytest.raises(RuntimeError, match="covariate_index must be an iterable"): - est.statistic(iris_X[:n_samples], iris_y[:n_samples], 0, metric="mi") - - # covariate index must be an iterable of ints - est.reset() - with pytest.raises(RuntimeError, match="Not all covariate_index"): - est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi") +@pytest.mark.parametrize("seed", [10, 0]) +def test_small_dataset_independent(seed): + # XXX: unit test interestingly does not work for MI, possibly due to bias + bootstrap = True + n_samples = 100 + 
n_features = 1000 + n_estimators = 100 - with pytest.raises( - RuntimeError, match="permute_forest_fraction must be greater than 1./n_estimators" - ): - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, - random_state=seed, - ), - permute_forest_fraction=1.0 / n_samples, - test_size=0.7, - random_state=seed, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") - - with pytest.raises(RuntimeError, match="permute_forest_fraction must be non-negative."): - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, - random_state=seed, - ), - permute_forest_fraction=-1.0 / n_estimators * 5, - test_size=0.7, - random_state=seed, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") - - -@pytest.mark.parametrize("covariate_index", [None, [0, 1]]) -def test_featureimportance_forest_statistic_with_covariate_index(covariate_index): - """Tests that calling `est.statistic` with covariate_index defined works. - - There should be no issue calling `est.statistic` with covariate_index defined. - """ - n_estimators = 10 - n_samples = 10 + rng = np.random.default_rng(seed) + X = rng.standard_normal(size=(n_samples, n_features)) + y = np.vstack( + (np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))) + ) # Binary classification - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, - random_state=seed, - ), - permute_forest_fraction=1.0 / n_estimators * 5, - test_size=0.7, + clf = HonestForestClassifier( + n_estimators=n_estimators, random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=bootstrap, + max_samples=1.6, + max_features=0.3, + stratify=True, ) - est.statistic( - iris_X[:n_samples], iris_y[:n_samples], covariate_index=covariate_index, metric="mi" - ) - - -@pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) -def test_featureimportance_forest_stratified(sample_dataset_per_tree): - n_samples = 100 - n_estimators = 10 - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, - random_state=seed, - ), - permute_forest_fraction=1.0 / n_estimators * 5, - test_size=0.7, + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, random_state=seed, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") - - _, indices_test = est.train_test_samples_[0] - y_test = iris_y[indices_test] - - assert len(y_test[y_test == 0]) == len(y_test[y_test == 1]), ( - f"{len(y_test[y_test==0])} " f"{len(y_test[y_test==1])}" + n_jobs=-1, + honest_fraction=0.5, + bootstrap=bootstrap, + max_samples=1.6, + max_features=0.3, + stratify=True, ) - - est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mi") - - _, indices_test = est.train_test_samples_[0] - y_test = iris_y[indices_test] - - assert len(y_test[y_test == 0]) == len(y_test[y_test == 1]), ( - f"{len(y_test[y_test==0])} " f"{len(y_test[y_test==1])}" + result = build_coleman_forest( + clf, perm_clf, X, y, n_repeats=1000, metric="s@98", return_posteriors=False, seed=seed ) + print(result.observe_stat, result.permuted_stat, result.pvalue, result.observe_test_stat) + assert_almost_equal(np.abs(result.observe_test_stat), 0.0, decimal=1) + assert result.pvalue > 0.05, f"{result.pvalue}" - 
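About the two assertions just above: in the class-based ``test`` method removed earlier in this patch, the reported test statistic was the permuted forest's metric minus the observed forest's metric, and the p-value was obtained by resampling the two forests' pooled posteriors ``n_repeats`` times (the ``_compute_null_distribution_coleman`` step). ``build_coleman_forest`` reports analogous ``observe_test_stat`` and ``pvalue`` fields, which is why this test expects a statistic near zero and a large p-value when the features are independent of ``y``. A toy sketch of that final resampling arithmetic, using the removed code's sign convention (the functional helper's internals are not shown in this patch and may differ in detail):

    import numpy as np

    rng = np.random.default_rng(0)
    n_repeats = 1000

    # Stand-ins for the metric of each resampled forest: under the null, the
    # observed and permuted forests score about the same.
    metric_star = rng.normal(loc=0.90, scale=0.01, size=n_repeats)     # observed
    metric_star_pi = rng.normal(loc=0.90, scale=0.01, size=n_repeats)  # permuted

    observe_test_stat = -0.001  # permuted stat minus observed stat (toy value)
    null_dist = metric_star_pi - metric_star

    # For metrics where larger is better, the permuted forest should score lower
    # under the alternative, so "extreme" means at least as negative.
    pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats)

Under the alternative the permuted forest underperforms, the difference becomes clearly negative under this convention, few resampled differences are as extreme, and the p-value shrinks; under the null, as in the assertions above, the difference sits near zero and the p-value stays large.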
-def test_featureimportance_forest_errors(): - sample_dataset_per_tree = True - n_estimators = 10 - est = FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=n_estimators, + # now permute only some of the features + feature_set_ends = [3, n_features] + clf = HonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=bootstrap, + max_features=0.3, + max_samples=1.6, + stratify=True, + tree_estimator=MultiViewDecisionTreeClassifier( + feature_set_ends=feature_set_ends, + apply_max_features_per_feature_set=True, ), - test_size=0.5, - permute_forest_fraction=None, - sample_dataset_per_tree=sample_dataset_per_tree, ) - with pytest.raises(RuntimeError, match="The estimator must be fitted"): - est.train_test_samples_ - - with pytest.raises(RuntimeError, match="There are less than 5 testing samples"): - est.statistic(iris_X[:5], iris_y[:5]) - - est = FeatureImportanceForestClassifier(estimator=RandomForestRegressor, test_size=0.5) - with pytest.raises(RuntimeError, match="Estimator must be"): - est.statistic(iris_X[:20], iris_y[:20]) - - est = FeatureImportanceForestRegressor(estimator=RandomForestClassifier, test_size=0.5) - with pytest.raises(RuntimeError, match="Estimator must be"): - est.statistic(iris_X[:20], iris_y[:20]) - - -@flaky(max_runs=2) -@pytest.mark.parametrize("criterion", ["gini", "entropy"]) -@pytest.mark.parametrize("honest_prior", ["empirical", "uniform"]) -@pytest.mark.parametrize( - "estimator", - [ - None, - DecisionTreeClassifier(), - ObliqueDecisionTreeClassifier(), - ], -) -@pytest.mark.parametrize( - "permute_forest_fraction", - [ - None, - 0.5, - ], -) -@pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) -def test_iris_pauc_statistic( - criterion, honest_prior, estimator, permute_forest_fraction, sample_dataset_per_tree -): - limit = 0.1 - max_features = "sqrt" - n_repeats = 200 - n_estimators = 25 - test_size = 0.2 - - # Check consistency on dataset iris. 
- clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - criterion=criterion, - n_estimators=n_estimators, - max_features=max_features, - tree_estimator=estimator, - honest_prior=honest_prior, - random_state=0, - n_jobs=1, + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=bootstrap, + max_samples=1.6, + max_features=0.3, + stratify=True, + tree_estimator=MultiViewDecisionTreeClassifier( + feature_set_ends=feature_set_ends, + apply_max_features_per_feature_set=True, ), - test_size=test_size, - sample_dataset_per_tree=sample_dataset_per_tree, - permute_forest_fraction=permute_forest_fraction, ) - # now add completely uninformative feature - X = np.hstack((iris_X, rng.standard_normal(size=(iris_X.shape[0], 4)))) - - # test for unimportant feature set - clf.reset() - if sample_dataset_per_tree and permute_forest_fraction is None: - # test in another test - return - - stat, pvalue = clf.test( + result = build_coleman_forest( + clf, + perm_clf, X, - iris_y, - np.arange(iris_X.shape[0], X.shape[1], dtype=int).tolist(), - n_repeats=n_repeats, - metric="auc", - ) - print(pvalue) - assert pvalue > 0.05, f"pvalue: {pvalue}" - - # test for important features that are permuted - stat, pvalue = clf.test(X, iris_y, [0, 1, 2, 3], n_repeats=n_repeats, metric="auc") - print(pvalue) - assert pvalue < 0.05, f"pvalue: {pvalue}" - - # one must call `reset()` to make sure the test is run on a "new" feature set properly - with pytest.raises(RuntimeError, match="X must have 8 features"): - clf.statistic(iris_X, iris_y, metric="auc", max_fpr=limit) - - clf.reset() - score = clf.statistic(iris_X, iris_y, metric="auc", max_fpr=limit) - assert score >= 0.8, "Failed with pAUC: {0} for max fpr: {1}".format(score, limit) - - assert isinstance(clf.estimator_, HonestForestClassifier) - - -@pytest.mark.parametrize( - "forest_hyppo", - [ - FeatureImportanceForestRegressor( - estimator=RandomForestRegressor( - n_estimators=10, - ), - random_state=seed, - ), - FeatureImportanceForestClassifier( - estimator=RandomForestClassifier( - n_estimators=10, - ), - random_state=seed, - permute_forest_fraction=None, - sample_dataset_per_tree=False, - ), - ], -) -def test_forestht_check_inputs(forest_hyppo): - n_samples = 100 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - - # Test case 1: Valid input - forest_hyppo.statistic(X, y) - - # Test case 2: Invalid input with different number of samples - X_invalid = np.random.rand(n_samples + 1, X.shape[1]) - y_invalid = rng.integers(0, 2, size=n_samples + 1) - with pytest.raises(RuntimeError, match="X must have"): - forest_hyppo.statistic(X_invalid, y_invalid) - - # Test case 3: Invalid input with different number of features - X_invalid = np.random.rand(X.shape[0], n_features + 1) - with pytest.raises(RuntimeError, match="X must have"): - forest_hyppo.statistic(X_invalid, y) - - # Test case 4: Invalid input with incorrect y type target - y_invalid = np.random.rand(X.shape[0]) - with pytest.raises(RuntimeError, match="y must have type"): - forest_hyppo.statistic(X, y_invalid) - - -@pytest.mark.parametrize("backend", ["loky", "threading"]) -@pytest.mark.parametrize("n_jobs", [1, 2]) -def test_parallelization(backend, n_jobs): - """Test parallelization of training forests.""" - n_samples = 20 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, 
size=n_samples) # Binary classification - - def run_forest(covariate_index=None): - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=n_jobs, honest_fraction=0.2 - ), - test_size=0.5, - ) - pvalue = clf.test(X, y, covariate_index=[covariate_index], metric="mi") - return pvalue - - out = Parallel(n_jobs=-1, backend=backend)( - delayed(run_forest)(covariate_index) for covariate_index in range(n_features) + y, + covariate_index=[1, 2, 3, 4, 5], + n_repeats=1000, + metric="s@98", + return_posteriors=False, + seed=seed, ) - assert len(out) == n_features - - -def test_pickle(tmpdir): - """Test that pickling works and preserves fitted attributes.""" - n_samples = 100 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - n_repeats = 1000 - - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 - ), - test_size=0.5, - ) - stat, pvalue = clf.test(X, y, covariate_index=[1], metric="mi", n_repeats=n_repeats) - - with open(Path(tmpdir) / "clf.pkl", "wb") as fpath: - pickle.dump(clf, fpath) - - with open(Path(tmpdir) / "clf.pkl", "rb") as fpath: - clf_pickle = pickle.load(fpath) - - # recompute pvalue manually and compare - pickle_pvalue = ( - 1.0 + (clf_pickle.null_dist_ <= (clf_pickle.permute_stat_ - clf_pickle.observe_stat_)).sum() - ) / (1.0 + n_repeats) - assert pvalue == pickle_pvalue - assert clf_pickle.observe_stat_ == -( - stat - clf_pickle.permute_stat_ - ), f"{clf_pickle.observe_stat_} != {-(stat - clf_pickle.permute_stat_)}" - - attr_list = [ - "test_size", - "observe_samples_", - "y_true_final_", - "observe_posteriors_", - "observe_stat_", - "_is_fitted", - "permute_samples_", - "permute_posteriors_", - "permute_stat_", - "n_samples_test_", - "_n_samples_", - "_metric", - "train_test_samples_", - ] - for attr in attr_list: - assert_array_equal(getattr(clf, attr), getattr(clf_pickle, attr)) - - -@pytest.mark.parametrize( - "permute_forest_fraction", - [None, 0.5], - ids=["no_permute", "permute_forest_fraction"], -) -@pytest.mark.parametrize( - "sample_dataset_per_tree", [True, False], ids=["sample_dataset_per_tree", "no_sample_dataset"] -) -def test_sample_size_consistency_of_estimator_indices_( - permute_forest_fraction, sample_dataset_per_tree -): - """Test that the test-sample indices are what is expected.""" - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 - ), - test_size=0.5, - permute_forest_fraction=permute_forest_fraction, - sample_dataset_per_tree=sample_dataset_per_tree, - stratify=False, - ) - - n_samples = 100 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - - _, posteriors, samples = clf.statistic( - X, y, covariate_index=None, return_posteriors=True, metric="mi" - ) - - if sample_dataset_per_tree or permute_forest_fraction is not None: - # check the non-nans - non_nan_idx = _non_nan_samples(posteriors) - if sample_dataset_per_tree: - assert clf.n_samples_test_ == n_samples, f"{clf.n_samples_test_} != {n_samples}" - - sorted_sample_idx = sorted(np.unique(samples)) - sorted_est_samples_idx = sorted( - np.unique(np.concatenate([x[1] for x in clf.train_test_samples_]).flatten()) - ) - assert_array_equal(sorted_sample_idx, non_nan_idx) - - # the sample indices are 
equal to the output of the train/test indices_ - # only if there are no nans in the posteriors over all the samples - if np.sum(non_nan_idx) == n_samples: - assert_array_equal( - sorted_sample_idx, - sorted_est_samples_idx, - f"{set(sorted_sample_idx) - set(sorted_est_samples_idx)} and " - f"{set(sorted_est_samples_idx) - set(sorted_sample_idx)}", - ) - else: - assert_array_equal( - samples, - sorted(clf.train_test_samples_[0][1]), - err_msg=f"Samples {set(samples) - set(sorted(clf.train_test_samples_[0][1]))}.", - ) - assert len(_non_nan_samples(posteriors)) == len(samples) - - -@pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) -@pytest.mark.parametrize("seed", [None, 0]) -def test_sample_per_tree_samples_consistency_with_sklearnforest(seed, sample_dataset_per_tree): - n_samples = 100 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - n_estimators = 10 - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, random_state=seed, n_jobs=1, honest_fraction=0.2 - ), - test_size=0.5, - permute_forest_fraction=1.0 / n_estimators, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - other_clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 - ), - test_size=0.5, - permute_forest_fraction=None, - sample_dataset_per_tree=sample_dataset_per_tree, - ) - - clf.statistic(X, y, covariate_index=None, metric="mi") - other_clf.statistic(X, y, covariate_index=None, metric="mi") - - # estimator indices should be preserved over multiple calls - estimator_train_test_indices = deepcopy(clf.train_test_samples_) - for idx in range(clf.n_estimators): - assert_array_equal(clf.train_test_samples_[idx][0], estimator_train_test_indices[idx][0]) - assert_array_equal(clf.train_test_samples_[idx][1], estimator_train_test_indices[idx][1]) - - # if the sample_dataset_per_tree, then the indices should be different across all trees - if sample_dataset_per_tree or clf.permute_forest_fraction > 0.0: - for indices, other_indices in combinations(clf.train_test_samples_, 2): - assert not np.array_equal(indices[0], other_indices[0]) - assert not np.array_equal(indices[1], other_indices[1]) - else: - for indices, other_indices in combinations(clf.train_test_samples_, 2): - assert_array_equal(indices[0], other_indices[0]) - assert_array_equal(indices[1], other_indices[1]) - - # estimator indices should be preserved over multiple calls - estimator_train_test_indices = deepcopy(other_clf.train_test_samples_) - for idx in range(clf.n_estimators): - assert_array_equal( - other_clf.train_test_samples_[idx][0], estimator_train_test_indices[idx][0] - ) - assert_array_equal( - other_clf.train_test_samples_[idx][1], estimator_train_test_indices[idx][1] - ) - - # when seed is passed, the indices should be deterministic - if seed is not None and sample_dataset_per_tree: - assert_array_equal( - clf.train_test_samples_[idx][0], other_clf.train_test_samples_[idx][0] - ) - assert_array_equal( - clf.train_test_samples_[idx][1], other_clf.train_test_samples_[idx][1] - ) - - -@pytest.mark.parametrize("seed", [None, 0]) -def test_small_dataset_independent(seed): - n_samples = 32 - n_features = 5 - - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=10, 
random_state=seed, n_jobs=1, honest_fraction=0.5 - ), - test_size=0.2, - permute_forest_fraction=None, - sample_dataset_per_tree=False, - ) - stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") - assert ~np.isnan(pvalue) - assert ~np.isnan(stat) - assert pvalue > 0.05 - - stat, pvalue = clf.test(X, y, metric="mi") - assert_almost_equal(stat, 0.0, decimal=1) - assert pvalue > 0.05 + assert ~np.isnan(result.pvalue) + assert ~np.isnan(result.observe_test_stat) + assert result.pvalue > 0.05, f"{result.pvalue}" @flaky(max_runs=3) -@pytest.mark.parametrize("seed", [None, 0]) +@pytest.mark.parametrize("seed", [10, 0]) def test_small_dataset_dependent(seed): - n_samples = 100 + n_samples = 50 n_features = 5 rng = np.random.default_rng(seed) @@ -562,130 +129,43 @@ def test_small_dataset_dependent(seed): [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] ) # Binary classification - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=50, random_state=seed, n_jobs=1, honest_fraction=0.5 - ), - test_size=0.2, - permute_forest_fraction=None, - sample_dataset_per_tree=False, + n_estimators = 100 + clf = HonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, ) - stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") - assert ~np.isnan(pvalue) - assert ~np.isnan(stat) - assert pvalue <= 0.05 - - stat, pvalue = clf.test(X, y, metric="mi") - assert pvalue <= 0.05 - - -@flaky(max_runs=3) -def test_no_traintest_split(): - n_samples = 500 - n_features = 5 - rng = np.random.default_rng(seed) - - X = rng.uniform(size=(n_samples, n_features)) - X = rng.uniform(size=(n_samples // 2, n_features)) - X2 = X * 2 - X = np.vstack([X, X2]) - y = np.vstack( - [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] - ) # Binary classification - - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=50, - max_features=n_features, - random_state=seed, - n_jobs=1, - honest_fraction=0.5, - ), - test_size=0.2, - train_test_split=False, - permute_forest_fraction=None, - sample_dataset_per_tree=False, + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, ) - stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") - - # since no train-test split, the training is all the data and the testing is none of the data - assert_array_equal(clf.train_test_samples_[0][0], np.arange(n_samples)) - assert_array_equal(clf.train_test_samples_[0][1], np.array([])) - - assert ~np.isnan(pvalue) - assert ~np.isnan(stat) - assert pvalue <= 0.05, f"{pvalue}" + result = build_coleman_forest( + clf, + perm_clf, + X, + y, + covariate_index=[1, 2], + n_repeats=1000, + metric="mi", + return_posteriors=False, + ) + assert ~np.isnan(result.pvalue) + assert ~np.isnan(result.observe_test_stat) + assert result.pvalue <= 0.05 - stat, pvalue = clf.test(X, y, metric="mi") - assert pvalue <= 0.05, f"{pvalue}" + result = build_coleman_forest(clf, perm_clf, X, y, metric="mi", return_posteriors=False) + assert result.pvalue <= 0.05 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - clf.reset() - - stat, pvalue = clf.test(X, y, metric="mi") - assert_almost_equal(stat, 0.0, decimal=1) - assert pvalue > 0.05, f"{pvalue}" - - stat, pvalue = clf.test(X, y, covariate_index=[1, 
2], metric="mi") - assert ~np.isnan(pvalue) - assert ~np.isnan(stat) - assert pvalue > 0.05, f"{pvalue}" - - -@pytest.mark.parametrize("permute_forest_fraction", [1.0 / 10, 0.5, 0.75, 1.0]) -@pytest.mark.parametrize("seed", [None, 0]) -def test_permute_forest_fraction(permute_forest_fraction, seed): - """Test proper handling of random seeds, shuffled covariates and train/test splits.""" - n_estimators = 10 - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, random_state=seed, n_jobs=1, honest_fraction=0.2 - ), - test_size=0.5, - permute_forest_fraction=permute_forest_fraction, - stratify=False, - ) - n_samples = 100 - n_features = 5 - X = rng.uniform(size=(n_samples, n_features)) - y = rng.integers(0, 2, size=n_samples) # Binary classification - - _ = clf.statistic(X, y, covariate_index=None, return_posteriors=True, metric="mi") - - seed = None - train_test_splits = list(clf.train_test_samples_) - train_inds = None - test_inds = None - for idx, tree in enumerate(clf.estimator_.estimators_): - # All random seeds of the meta-forest should be as expected, where - # the seed only changes depending on permute forest fraction - if idx % int(permute_forest_fraction * clf.n_estimators) == 0: - prev_seed = seed - seed = clf._seeds[idx] - - assert seed == tree.random_state - assert prev_seed != seed - else: - assert seed == clf._seeds[idx], f"{seed} != {clf._seeds[idx]}" - assert seed == clf._seeds[idx - 1] - - # Next, train/test splits should be consistent for batches of trees - if idx % int(permute_forest_fraction * clf.n_estimators) == 0: - prev_train_inds = train_inds - prev_test_inds = test_inds - - train_inds, test_inds = train_test_splits[idx] - - assert (prev_train_inds != train_inds).any(), f"{prev_train_inds} == {train_inds}" - assert (prev_test_inds != test_inds).any(), f"{prev_test_inds} == {test_inds}" - else: - assert_array_equal(train_inds, train_test_splits[idx][0]) - assert_array_equal(test_inds, train_test_splits[idx][1]) - - -def test_comight_repeated_feature_sets(): +@pytest.mark.parametrize("seed", [10, 0]) +def test_comight_repeated_feature_sets(seed): """Test COMIGHT when there are repeated feature sets.""" n_samples = 50 n_features = 500 @@ -698,86 +178,58 @@ def test_comight_repeated_feature_sets(): X = np.vstack([X, X2]) y = np.vstack([np.zeros((n_samples, 1)), np.ones((n_samples, 1))]) # Binary classification - X = np.hstack((X, X)) + X = np.hstack((X, X + rng.standard_normal(size=(n_samples * 2, n_features)) * 1e-5)) feature_set_ends = [n_features, n_features * 2] - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=50, - random_state=seed, - n_jobs=1, - honest_fraction=0.5, - tree_estimator=MultiViewDecisionTreeClassifier( - feature_set_ends=feature_set_ends, - max_features=0.3, - apply_max_features_per_feature_set=True, - ), + n_estimators = 50 + clf = HonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, + max_features=0.3, + stratify=True, + tree_estimator=MultiViewDecisionTreeClassifier( + feature_set_ends=feature_set_ends, + apply_max_features_per_feature_set=True, ), - test_size=0.2, - permute_forest_fraction=None, - sample_dataset_per_tree=False, + ) + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, + max_features=0.3, + stratify=True, + 
tree_estimator=MultiViewDecisionTreeClassifier( + feature_set_ends=feature_set_ends, + apply_max_features_per_feature_set=True, + ), ) # first test MIGHT rejects the null, since there is information - stat, pvalue = clf.test(X, y, metric="mi") - assert pvalue < 0.05 + result = build_coleman_forest( + clf, perm_clf, X, y, n_repeats=1000, metric="mi", return_posteriors=False + ) + assert result.pvalue < 0.05 # second test CoMIGHT fails to reject the null, since the information # is entirely contained in the first feature set - stat, pvalue = clf.test(X, y, covariate_index=np.arange(n_features), metric="mi") - assert pvalue > 0.05, f"{pvalue}" - - -def test_null_with_partial_auc(): - limit = 0.1 - max_features = "sqrt" - n_repeats = 1000 - n_estimators = 20 - test_size = 0.2 - - # Check consistency on dataset iris. - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - random_state=0, - n_jobs=1, - ), - test_size=test_size, - random_state=0, - ) - # now add completely uninformative feature - X = np.hstack((iris_X, rng.standard_normal(size=(iris_X.shape[0], 4)))) - - stat, pvalue = clf.test( + result = build_coleman_forest( + clf, + perm_clf, X, - iris_y, - covariate_index=np.arange(2), - n_repeats=n_repeats, - metric="auc", - ) - first_null_dist = deepcopy(clf.null_dist_) - - # If we re-run it with a different seed, but now specifying max_fpr - # there should be a difference in the partial-AUC distribution - clf = FeatureImportanceForestClassifier( - estimator=HonestForestClassifier( - n_estimators=n_estimators, - max_features=max_features, - random_state=0, - n_jobs=1, - ), - test_size=test_size, - random_state=seed, - ) - stat, pvalue = clf.test( - X, iris_y, covariate_index=np.arange(2), n_repeats=n_repeats, metric="auc", max_fpr=limit + y, + n_repeats=1000, + covariate_index=np.arange(n_features), + metric="mi", + return_posteriors=False, ) - second_null_dist = clf.null_dist_ - null_dist_pvalue = wilcoxon(first_null_dist, second_null_dist).pvalue - assert null_dist_pvalue < 0.05, null_dist_pvalue - assert pvalue > 0.05, f"pvalue: {pvalue}" + assert result.pvalue > 0.05, f"{result.pvalue}" def test_build_coleman_forest(): diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py index 3ad9c065a..31232a38e 100644 --- a/sktree/tests/test_honest_forest.py +++ b/sktree/tests/test_honest_forest.py @@ -103,6 +103,9 @@ def test_iris_multi(criterion, max_features, honest_prior, estimator): n_estimators = 10 # Check consistency on dataset iris. + # Note: bootstrap is False here for backwards compatibility and making + # the unit-test pass. Since bootstrap is not the feature being tested + # here, this is fine. clf = HonestForestClassifier( criterion=criterion, random_state=0, @@ -110,6 +113,7 @@ def test_iris_multi(criterion, max_features, honest_prior, estimator): n_estimators=n_estimators, honest_prior=honest_prior, tree_estimator=estimator, + bootstrap=False, ) second_y = np.concatenate([(np.ones(50) * 3), (np.ones(50) * 4), (np.ones(50) * 5)]) @@ -259,7 +263,7 @@ def test_honest_decision_function(honest_fraction, val): @parametrize_with_checks( - [HonestForestClassifier(n_estimators=10, honest_fraction=0.5, random_state=0)] + [HonestForestClassifier(n_estimators=10, honest_fraction=0.5, random_state=0, bootstrap=False)] ) def test_sklearn_compatible_estimator(estimator, check): # 1. check_class_weight_classifiers is not supported since it requires sample weight