diff --git a/.gitignore b/.gitignore
index 39a3a651a..db44ed79b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,7 @@
 doc/coverages
 doc/samples
 cover
 examples/*.jpg
+examples/**/*.jpg
 env/
 html/
diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
index 1f76b888f..38685efad 100644
--- a/doc/modules/ensemble.rst
+++ b/doc/modules/ensemble.rst
@@ -22,8 +22,8 @@ more information and intuition, see
 
 .. topic:: Examples:
 
-  * :ref:`sphx_glr_auto_examples_plot_oblique_random_forest.py`
-  * :ref:`sphx_glr_auto_examples_plot_oblique_axis_aligned_forests_sparse_parity.py`
+  * :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_random_forest.py`
+  * :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_axis_aligned_forests_sparse_parity.py`
 
 .. topic:: References
diff --git a/doc/modules/supervised_tree.rst b/doc/modules/supervised_tree.rst
index 91824e04c..1aa0a602a 100644
--- a/doc/modules/supervised_tree.rst
+++ b/doc/modules/supervised_tree.rst
@@ -41,7 +41,7 @@ as follows:
     >>> clf = tree.ObliqueDecisionTreeClassifier()
     >>> clf = clf.fit(X, y)
 
-.. figure:: ../auto_examples/images/sphx_glr_plot_iris_dtc_002.png
-   :target: ../auto_examples/plot_iris_dtc.html
+.. figure:: ../auto_examples/sklearn_vs_sktree/images/sphx_glr_plot_iris_dtc_002.png
+   :target: ../auto_examples/sklearn_vs_sktree/plot_iris_dtc.html
    :align: center
 
diff --git a/examples/calibration/README.txt b/examples/calibration/README.txt
new file mode 100644
index 000000000..84d5f5758
--- /dev/null
+++ b/examples/calibration/README.txt
@@ -0,0 +1,6 @@
+.. _calibration_examples:
+
+Calibrated decision trees via honesty
+-------------------------------------
+
+Examples demonstrating the usage of honest decision trees to obtain calibrated predictions.
diff --git a/examples/plot_overlapping_gaussians.py b/examples/calibration/plot_overlapping_gaussians.py
similarity index 100%
rename from examples/plot_overlapping_gaussians.py
rename to examples/calibration/plot_overlapping_gaussians.py
diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
new file mode 100644
index 000000000..882f80c3d
--- /dev/null
+++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
@@ -0,0 +1,237 @@
+"""
+===============================================================================
+Mutual Information for Gigantic Hypothesis Testing (MIGHT) with Imbalanced Data
+===============================================================================
+
+Here, we demonstrate how to do hypothesis testing on data that is highly imbalanced
+in terms of its feature-set dimensionalities, using mutual information as a test
+statistic. We use the framework of :footcite:`coleman2022scalable` to estimate
+pvalues efficiently.
+
+We simulate two feature sets, one of which is important for the target,
+but significantly smaller in dimensionality than the other feature set, which
+is unimportant for the target. We then use the MIGHT framework to test for
+the importance of each feature set. Instead of leveraging a normal honest random
+forest to estimate the posteriors, here we leverage a multi-view honest random
+forest, with knowledge of the multi-view structure of the ``X`` data.
+
+For other examples of hypothesis testing, see the following:
+
+- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_MI_gigantic_hypothesis_testing_forest.py`
+- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_might_auc.py`
+
+For more information on the multi-view decision-tree, see
+:ref:`sphx_glr_auto_examples_multiview_plot_multiview_dtc.py`.
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.datasets import make_blobs
+
+from sktree import HonestForestClassifier
+from sktree.stats import FeatureImportanceForestClassifier
+from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier
+
+seed = 12345
+rng = np.random.default_rng(seed)
+
+# %%
+# Simulate data
+# -------------
+# We simulate the two feature sets and the target variable, then combine them
+# into a single dataset to perform hypothesis testing.
+
+
+def make_multiview_classification(
+    n_samples=100, n_features_1=10, n_features_2=1000, cluster_std=2.0, seed=None
+):
+    rng = np.random.default_rng(seed=seed)
+
+    # Create a high-dimensional multiview dataset with a low-dimensional informative
+    # subspace in one view of the dataset.
+    X0_first, y0 = make_blobs(
+        n_samples=n_samples,
+        cluster_std=cluster_std,
+        n_features=n_features_1 // 2,
+        random_state=rng.integers(1, 10000),
+        centers=1,
+    )
+
+    X1_first, y1 = make_blobs(
+        n_samples=n_samples,
+        cluster_std=cluster_std,
+        n_features=n_features_1 // 2,
+        random_state=rng.integers(1, 10000),
+        centers=1,
+    )
+
+    # create the first views for y=0 and y=1
+    X0_first = np.concatenate(
+        (X0_first, rng.standard_normal(size=(n_samples, n_features_1 // 2))), axis=1
+    )
+    X1_first = np.concatenate(
+        (X1_first, rng.standard_normal(size=(n_samples, n_features_1 // 2))), axis=1
+    )
+    y1[:] = 1
+
+    # add the second view for y=0 and y=1, which is completely noise
+    X0 = np.concatenate([X0_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1)
+    X1 = np.concatenate([X1_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1)
+
+    # combine the views and targets
+    X = np.vstack((X0, X1))
+    y = np.hstack((y0, y1)).T
+
+    # add noise to the data
+    X = X + rng.standard_normal(size=X.shape)
+
+    return X, y
+
+
+n_samples = 100
+n_features = 10000
+# cumulative ends of the two feature sets: view 1 spans columns [0, 10),
+# view 2 spans the remaining ``n_features`` columns
+n_features_views = [10, n_features + 10]
+
+X, y = make_multiview_classification(
+    n_samples=n_samples,
+    n_features_1=10,
+    n_features_2=n_features,
+    cluster_std=2.0,
+    seed=seed,
+)
+# %%
+# Perform hypothesis testing using Mutual Information
+# ---------------------------------------------------
+# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis
+# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated
+# between two forests. One forest is trained on the original dataset, and one forest is trained
+# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly.
+#
+# The null distribution is then estimated in an efficient manner using the framework of
+# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors)
+# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then
+# computed as the proportion of samples in the null distribution that are at least as extreme
+# as the observed test statistic.
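+#
+# As a rough sketch of that last step (illustrative only; the estimator's internals
+# may differ), a permutation-style pvalue can be computed from resampled null
+# statistics as follows, where ``sketch_null_dist`` is a hypothetical stand-in for
+# the ``n_repeats`` resampled MI differences under the null:
+
+sketch_null_dist = rng.normal(loc=0.0, scale=0.01, size=1000)  # hypothetical null draws
+sketch_observed = 0.05  # hypothetical observed MI difference
+sketch_pvalue = (1 + np.sum(sketch_null_dist >= sketch_observed)) / (1 + len(sketch_null_dist))
+print(f"Sketch pvalue: {sketch_pvalue:.4f}")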
+
+n_estimators = 200
+max_features = "sqrt"
+test_size = 0.2
+n_repeats = 1000
+n_jobs = -1
+
+est = FeatureImportanceForestClassifier(
+    estimator=HonestForestClassifier(
+        n_estimators=n_estimators,
+        max_features=max_features,
+        tree_estimator=MultiViewDecisionTreeClassifier(feature_set_ends=n_features_views),
+        random_state=seed,
+        honest_fraction=0.5,
+        n_jobs=n_jobs,
+    ),
+    random_state=seed,
+    test_size=test_size,
+    permute_per_tree=False,
+    sample_dataset_per_tree=False,
+)
+
+mv_results = dict()
+
+print(
+    f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: "
+    f"{est.sample_dataset_per_tree}"
+)
+# we test for the first feature set, which is important and thus should return a pvalue < 0.05
+stat, pvalue = est.test(
+    X, y, covariate_index=np.arange(10, dtype=int), metric="mi", n_repeats=n_repeats
+)
+mv_results["important_feature_stat"] = stat
+mv_results["important_feature_pvalue"] = pvalue
+print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}")
+
+# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05
+stat, pvalue = est.test(
+    X,
+    y,
+    covariate_index=np.arange(10, n_features + 10, dtype=int),
+    metric="mi",
+    n_repeats=n_repeats,
+)
+mv_results["unimportant_feature_stat"] = stat
+mv_results["unimportant_feature_pvalue"] = pvalue
+print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}")
+
+# %%
+# Let's investigate what happens when we do not use a multi-view decision tree.
+# All other parameters are kept the same.
+
+est = FeatureImportanceForestClassifier(
+    estimator=HonestForestClassifier(
+        n_estimators=n_estimators,
+        max_features=max_features,
+        tree_estimator=DecisionTreeClassifier(),
+        random_state=seed,
+        honest_fraction=0.5,
+        n_jobs=n_jobs,
+    ),
+    random_state=seed,
+    test_size=test_size,
+    permute_per_tree=False,
+    sample_dataset_per_tree=False,
+)
+
+rf_results = dict()
+
+# we test for the first feature set, which is important and thus should return a pvalue < 0.05
+stat, pvalue = est.test(
+    X, y, covariate_index=np.arange(10, dtype=int), metric="mi", n_repeats=n_repeats
+)
+rf_results["important_feature_stat"] = stat
+rf_results["important_feature_pvalue"] = pvalue
+print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}")
+
+# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05
+stat, pvalue = est.test(
+    X,
+    y,
+    covariate_index=np.arange(10, n_features + 10, dtype=int),
+    metric="mi",
+    n_repeats=n_repeats,
+)
+rf_results["unimportant_feature_stat"] = stat
+rf_results["unimportant_feature_pvalue"] = pvalue
+print(f"Estimated MI difference using regular decision-trees: {stat} with Pvalue: {pvalue}")
+
+fig, ax = plt.subplots(figsize=(5, 3))
+
+# plot pvalues
+ax.bar(0, rf_results["important_feature_pvalue"], label="Important Feature Set (RF)")
+ax.bar(1, rf_results["unimportant_feature_pvalue"], label="Unimportant Feature Set (RF)")
+ax.bar(2, mv_results["important_feature_pvalue"], label="Important Feature Set (MV)")
+ax.bar(3, mv_results["unimportant_feature_pvalue"], label="Unimportant Feature Set (MV)")
+ax.axhline(0.05, color="k", linestyle="--", label="alpha=0.05")
+ax.set(ylabel="Pvalue (log scale)", xlim=[-0.5, 3.5], yscale="log")
+ax.legend()
+
+fig.tight_layout()
+plt.show()
+
+# %%
+# Discussion
+# ----------
+# We see that the multi-view decision tree is able to detect the important feature set,
+# while the regular decision tree is not.
+# This is because the regular decision tree
+# is not aware of the multi-view structure of the data, and thus is challenged
+# by the imbalanced dimensionality of the feature sets. That is, it rarely splits on
+# the first low-dimensional feature set, and thus is unable to detect its importance.
+#
+# Note that, as expected, both approaches still fail to reject the null hypothesis
+# (at alpha = 0.05) when testing the unimportant feature set. The difference between
+# the two approaches shows that the statistical power of the multi-view decision tree
+# is higher than that of the regular decision tree in this simulation.
+
+# %%
+# References
+# ----------
+# .. footbibliography::
diff --git a/examples/multiview/README.txt b/examples/multiview/README.txt
new file mode 100644
index 000000000..127ad6e57
--- /dev/null
+++ b/examples/multiview/README.txt
@@ -0,0 +1,6 @@
+.. _multiview_examples:
+
+Multi-view learning with Decision-trees
+---------------------------------------
+
+Examples demonstrating multi-view learning using random forest variants.
diff --git a/examples/multiview/plot_multiview_dtc.py b/examples/multiview/plot_multiview_dtc.py
new file mode 100644
index 000000000..37629093c
--- /dev/null
+++ b/examples/multiview/plot_multiview_dtc.py
@@ -0,0 +1,147 @@
+"""
+============================================================
+Analyze a multi-view dataset with a multi-view random forest
+============================================================
+
+An example using :class:`~sktree.MultiViewRandomForestClassifier` for high-dimensional
+classification when there are multiple feature sets that are correlated with the
+target variable, ``y``. The multi-view random forest is a variant of the random forest
+that samples from each feature set uniformly, instead of sampling from all features
+uniformly. This is useful when there are multiple feature sets, and some feature sets
+have vastly different dimensionality from others.
+
+In this case, ``X`` is a matrix of shape ``(n_samples, n_features)``, where
+``n_features`` is the sum of the number of features in each feature set. If the multi-view
+structure is known, then one can pass this to the multi-view random forest via the
+``feature_set_ends`` parameter.
+
+For a visualization of how the multi-view splitter works, see
+:ref:`sphx_glr_auto_examples_splitters_plot_multiview_axis_aligned_splitter.py`.
+"""
+
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from sklearn.datasets import make_blobs
+from sklearn.model_selection import cross_val_score
+
+from sktree import MultiViewRandomForestClassifier, RandomForestClassifier
+
+seed = 12345
+rng = np.random.default_rng(seed)
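+
+# A note on ``feature_set_ends`` (a sketch of the assumed semantics): each entry is
+# the cumulative column offset at which a feature set ends. For example, with two
+# views of 5 and 1000 features stacked column-wise:
+example_feature_set_ends = np.array([5, 1005])  # view 1: columns [0, 5); view 2: [5, 1005)
+assert example_feature_set_ends[-1] == 5 + 1000  # last entry equals the total feature count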
+
+
+def make_multiview_classification(
+    n_samples=100, n_features_1=5, n_features_2=1000, cluster_std=2.0, seed=None
+):
+    rng = np.random.default_rng(seed=seed)
+
+    # Create a high-dimensional multiview dataset with a low-dimensional informative
+    # subspace in one view of the dataset.
+    X0_first, y0 = make_blobs(
+        n_samples=n_samples,
+        cluster_std=cluster_std,
+        n_features=n_features_1,
+        random_state=rng.integers(1, 10000),
+        centers=1,
+    )
+
+    X1_first, y1 = make_blobs(
+        n_samples=n_samples,
+        cluster_std=cluster_std,
+        n_features=n_features_1,
+        random_state=rng.integers(1, 10000),
+        centers=1,
+    )
+    y1[:] = 1
+    X0 = np.concatenate([X0_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1)
+    X1 = np.concatenate([X1_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1)
+    X = np.vstack((X0, X1))
+    y = np.hstack((y0, y1)).T
+
+    X = X + rng.standard_normal(size=X.shape)
+
+    return X, y
+
+
+# %%
+# Simulate data
+# -------------
+# We simulate a 2-view dataset in which only the first view contains informative,
+# low-dimensional features; the second view is entirely noise. The first view has
+# five dimensions, while the second view varies from five to ten thousand dimensions.
+# The sample size is kept fixed, so we can compare the performance of regular random
+# forests with multi-view random forests.
+
+n_samples = 100
+n_features_views = np.linspace(5, 10000, 5).astype(int)
+
+datasets = []
+for idx, n_features in enumerate(n_features_views):
+    X, y = make_multiview_classification(
+        n_samples=n_samples,
+        n_features_1=5,
+        n_features_2=n_features,
+        cluster_std=2.0,
+        seed=seed + idx,
+    )
+    datasets.append((X, y))
+
+# %%
+# Fit Random Forest and Multi-view Random Forest
+# ----------------------------------------------
+# Here, we fit both forests over all the datasets.
+
+n_estimators = 100
+n_jobs = -1
+
+scores = defaultdict(list)
+
+for idx, ((X, y), n_features) in enumerate(zip(datasets, n_features_views)):
+    feature_set_ends = np.array([5, n_features + 5])
+
+    rf = RandomForestClassifier(
+        n_estimators=n_estimators,
+        n_jobs=n_jobs,
+        random_state=seed,
+    )
+
+    mvrf = MultiViewRandomForestClassifier(
+        n_estimators=n_estimators,
+        n_jobs=n_jobs,
+        feature_set_ends=feature_set_ends,
+        random_state=seed,
+    )
+
+    # obtain the cross-validation score
+    rf_score = cross_val_score(rf, X, y, cv=2).mean()
+    mvrf_score = cross_val_score(mvrf, X, y, cv=2).mean()
+
+    scores["rf"].append(rf_score)
+    scores["mvrf"].append(mvrf_score)
+
+# %%
+# Visualize scores and compare performance
+# ----------------------------------------
+# Now, we can compare the performance from the cross-validation experiment.
+
+scores["n_features"] = n_features_views
+df = pd.DataFrame(scores)
+
+# melt the dataframe, to make it easier to plot
+df = pd.melt(df, id_vars=["n_features"], var_name="model", value_name="score")
+
+fig, ax = plt.subplots()
+sns.lineplot(data=df, x="n_features", y="score", marker="o", hue="model", ax=ax)
+ax.set_ylabel("CV Score")
+ax.set_xlabel("Number of features in second view")
+ax.set_title("Random Forest vs Multi-view Random Forest")
+plt.show()
+
+# %%
+# As we can see, the multi-view random forest outperforms the regular random forest
+# as the number of features in the second view increases. This is because the multi-view
+# random forest samples from each feature set uniformly, while the regular random forest
+# samples from all features uniformly. This is the key difference between the two forests.
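+#
+# A back-of-the-envelope sketch of why this matters (illustrative numbers only,
+# assuming roughly ``sqrt(n_features)`` split candidates are drawn per node): with
+# 5 informative and 10000 noisy features, a uniform draw over all columns rarely
+# proposes an informative candidate at any given split.

+n_total_features = 5 + 10000
+n_candidates = int(np.sqrt(n_total_features))  # roughly 100 candidates per split
+expected_informative = n_candidates * 5 / n_total_features
+print(f"Expected informative candidates per split: {expected_informative:.3f}")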
diff --git a/examples/sklearn_vs_sktree/README.txt b/examples/sklearn_vs_sktree/README.txt
new file mode 100644
index 000000000..d942d71f8
--- /dev/null
+++ b/examples/sklearn_vs_sktree/README.txt
@@ -0,0 +1,6 @@
+.. _sklearn_examples:
+
+Comparing sklearn and sktree decision trees
+-------------------------------------------
+
+Examples demonstrating the difference between sklearn and sktree decision trees.
diff --git a/examples/plot_iris_dtc.py b/examples/sklearn_vs_sktree/plot_iris_dtc.py
similarity index 100%
rename from examples/plot_iris_dtc.py
rename to examples/sklearn_vs_sktree/plot_iris_dtc.py
diff --git a/examples/sparse_oblique_trees/README.txt b/examples/sparse_oblique_trees/README.txt
new file mode 100644
index 000000000..61c596af1
--- /dev/null
+++ b/examples/sparse_oblique_trees/README.txt
@@ -0,0 +1,6 @@
+.. _sporf_examples:

+Sparse oblique projections with oblique decision-trees
+------------------------------------------------------
+
+Examples demonstrating learning using oblique random forests.
diff --git a/examples/plot_extra_oblique_random_forest.py b/examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py
similarity index 98%
rename from examples/plot_extra_oblique_random_forest.py
rename to examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py
index 586edd833..fd920cc16 100644
--- a/examples/plot_extra_oblique_random_forest.py
+++ b/examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py
@@ -59,7 +59,7 @@ range, hence the complexity is `O(n)`. This makes the algorithm more suitable for
 large datasets.
 
 To see how sample-sizes affect the performance of Extra Oblique Trees vs regular Oblique Trees,
-see :ref:`sphx_glr_auto_examples_plot_extra_orf_sample_size.py`
+see :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_extra_orf_sample_size.py`
 
 References
 ----------
diff --git a/examples/plot_extra_orf_sample_size.py b/examples/sparse_oblique_trees/plot_extra_orf_sample_size.py
similarity index 100%
rename from examples/plot_extra_orf_sample_size.py
rename to examples/sparse_oblique_trees/plot_extra_orf_sample_size.py
diff --git a/examples/plot_oblique_axis_aligned_forests_sparse_parity.py b/examples/sparse_oblique_trees/plot_oblique_axis_aligned_forests_sparse_parity.py
similarity index 100%
rename from examples/plot_oblique_axis_aligned_forests_sparse_parity.py
rename to examples/sparse_oblique_trees/plot_oblique_axis_aligned_forests_sparse_parity.py
diff --git a/examples/plot_oblique_forests_iris.py b/examples/sparse_oblique_trees/plot_oblique_forests_iris.py
similarity index 100%
rename from examples/plot_oblique_forests_iris.py
rename to examples/sparse_oblique_trees/plot_oblique_forests_iris.py
diff --git a/examples/plot_oblique_random_forest.py b/examples/sparse_oblique_trees/plot_oblique_random_forest.py
similarity index 97%
rename from examples/plot_oblique_random_forest.py
rename to examples/sparse_oblique_trees/plot_oblique_random_forest.py
index 34cb8c68a..dca3f19da 100644
--- a/examples/plot_oblique_random_forest.py
+++ b/examples/sparse_oblique_trees/plot_oblique_random_forest.py
@@ -18,7 +18,7 @@
 are subsampled due to computational constraints.
 
 For an example of using extra-oblique trees/forests in practice on data, see the following
-example :ref:`sphx_glr_auto_examples_plot_extra_oblique_random_forest.py`.
+example :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_extra_oblique_random_forest.py`.
""" from datetime import datetime diff --git a/examples/splitters/plot_multiview_axis_aligned_splitter.py b/examples/splitters/plot_multiview_axis_aligned_splitter.py index 8fc5b32c2..7ceb200da 100644 --- a/examples/splitters/plot_multiview_axis_aligned_splitter.py +++ b/examples/splitters/plot_multiview_axis_aligned_splitter.py @@ -122,3 +122,6 @@ # In contrast, the normal splitter in :class:`sklearn.tree.DecisionTreeClassifier` samples # randomly across all ``n_features`` features because it is not aware of the multi-view structure. # This is the key difference between the two splitters. +# +# For an example of using the multi-view splitter in practice on simulated data, see +# :ref:`sphx_glr_auto_examples_multiview_plot_multiview_dtc.py`. diff --git a/examples/splitters/plot_sparse_projection_matrix.py b/examples/splitters/plot_sparse_projection_matrix.py index 6eb53e0ca..c77c128a0 100644 --- a/examples/splitters/plot_sparse_projection_matrix.py +++ b/examples/splitters/plot_sparse_projection_matrix.py @@ -125,5 +125,5 @@ # For an example of using oblique trees/forests in practice on data, see the following # examples: # -# - :ref:`sphx_glr_auto_examples_plot_oblique_forests_iris.py` -# - :ref:`sphx_glr_auto_examples_plot_oblique_random_forest.py` +# - :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_forests_iris.py` +# - :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_random_forest.py`