-
-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Adam Li <adam2392@gmail.com>
- Loading branch information
Showing
105 changed files
with
5,497 additions
and
1,959 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
# Sphinx build info version 1 | ||
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. | ||
config: 08d17cba0db21befacb226b9f7a187ed | ||
config: b0c18907bb177ae895c6d330acbc0e3f | ||
tags: 645f666f9bcd5a90fca523b33c5a78b7 |
Binary file modified
BIN
+4.17 KB
(110%)
v0.2/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
Binary file not shown.
Binary file modified
BIN
+5.67 KB
(110%)
v0.2/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
Binary file not shown.
127 changes: 127 additions & 0 deletions
127
v0.2/_downloads/71f52ddc897be17d747e41dffb3b0762/plot_might_auc.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
""" | ||
=================================================================================== | ||
Compute partial AUC using Mutual Information for Genuine Hypothesis Testing (MIGHT) | ||
=================================================================================== | ||
An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for a nonparametric | ||
multivariate hypothesis test on simulated datasets. Here, we present a simulation | ||
of how MIGHT is used to evaluate whether a feature set is important for predicting the target. | ||
We simulate a dataset with 1000 features, 1000 samples, and a binary class target | ||
variable. The features are split into two feature sets: 500 features belong to one feature | ||
set, and the other 500 features belong to another feature set. One could think of | ||
these for example as different datasets collected on the same patient in a biomedical setting. | ||
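The first feature set (X, ``X_important`` in the code) is strongly correlated with the target, | ||
and the second feature set (W, ``X_unimportant`` in the code) is pure noise drawn independently | ||
of the target (y), so it is at most weakly correlated with it. | ||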
We then use MIGHT to calculate the partial AUC of these sets. | ||
""" | ||
|
||
import numpy as np | ||
from scipy.special import expit | ||
|
||
from sktree import HonestForestClassifier | ||
from sktree.stats import FeatureImportanceForestClassifier | ||
from sktree.tree import DecisionTreeClassifier | ||
|
||
seed = 12345 | ||
rng = np.random.default_rng(seed) | ||
|
||
# %% | ||
# Simulate data | ||
# ------------- | ||
# We simulate the two feature sets, and the target variable. We then combine them | ||
# into a single dataset to perform hypothesis testing. | ||
|
||
n_samples = 1000 | ||
n_features_set = 500 | ||
mean = 1.0 | ||
sigma = 2.0 | ||
beta = 5.0 | ||
|
||
unimportant_mean = 0.0 | ||
unimportant_sigma = 4.5 | ||
|
||
# first sample the informative features, and then the uninformative features | ||
X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10)) | ||
X_important = np.hstack( | ||
[ | ||
X_important, | ||
rng.normal( | ||
loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10) | ||
), | ||
] | ||
) | ||
|
||
X_unimportant = rng.normal( | ||
loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set) | ||
) | ||
|
||
# simulate the binary target variable | ||
y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples) | ||
|
||
# %% | ||
# Use partial AUC as test statistic | ||
# --------------------------------- | ||
# You can restrict the AUC to a maximum false positive rate (equivalently, a minimum | ||
# specificity of ``1 - max_fpr``) by modifying ``max_fpr`` in ``statistic``. | ||
|
||
n_estimators = 125 | ||
max_features = "sqrt" | ||
metric = "auc" | ||
test_size = 0.2 | ||
n_jobs = -1 | ||
honest_fraction = 0.7 | ||
max_fpr = 0.1 | ||
|
||
est = FeatureImportanceForestClassifier( | ||
estimator=HonestForestClassifier( | ||
n_estimators=n_estimators, | ||
max_features=max_features, | ||
tree_estimator=DecisionTreeClassifier(), | ||
random_state=seed, | ||
honest_fraction=honest_fraction, | ||
n_jobs=n_jobs, | ||
), | ||
random_state=seed, | ||
test_size=test_size, | ||
permute_per_tree=True, | ||
sample_dataset_per_tree=True, | ||
) | ||
|
||
# we test for the first feature set, which is important and thus should return a higher AUC | ||
stat, posterior_arr, samples = est.statistic( | ||
X_important, | ||
y, | ||
metric=metric, | ||
return_posteriors=True, | ||
) | ||
|
||
print(f"ASH-90 / Partial AUC: {stat}") | ||
print(f"Shape of Observed Samples: {samples.shape}") | ||
print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") | ||
|
||
# %% | ||
# Repeat for the second feature set | ||
# --------------------------------- | ||
# This feature set has a smaller statistic, which is expected due to its weak correlation. | ||
|
||
stat, posterior_arr, samples = est.statistic( | ||
X_unimportant, | ||
y, | ||
metric=metric, | ||
return_posteriors=True, | ||
) | ||
|
||
print(f"ASH-90 / Partial AUC: {stat}") | ||
print(f"Shape of Observed Samples: {samples.shape}") | ||
print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}") | ||
|
||
# %% | ||
# All posteriors are saved within the model | ||
# ----------------------------------------- | ||
# Extract the results from the model variables anytime. You can save the model with ``pickle``. | ||
# | ||
# ASH-90 / Partial AUC: ``est.observe_stat_`` | ||
# Observed Samples: ``est.observe_samples_`` | ||
# Tree Posteriors for the positive class: ``est.observe_posteriors_`` (n_trees, n_samples_test, 1) | ||
# True Labels: ``est.y_true_final_`` |
104 changes: 104 additions & 0 deletions
104
v0.2/_downloads/b3e7bddab5998eaeaa7413978232d146/plot_might_auc.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"\n# Compute partial AUC using Mutual Information for Genuine Hypothesis Testing (MIGHT)\n\nAn example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric\nmultivariate hypothesis test, on simulated datasets. Here, we present a simulation\nof how MIGHT is used to evaluate how a \"feature set is important for predicting the target\".\n\nWe simulate a dataset with 1000 features, 1000 samples, and a binary class target\nvariable. The features are split into two feature sets: 500 features belong to one feature\nset, and the other 500 features belong to another feature set. One could think of\nthese for example as different datasets collected on the same patient in a biomedical setting.\nThe first feature set (X) is strongly correlated with the target, and the second\nfeature set (W) is weakly correlated with the target (y).\n\nWe then use MIGHT to calculate the partial AUC of these sets.\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import numpy as np\nfrom scipy.special import expit\n\nfrom sktree import HonestForestClassifier\nfrom sktree.stats import FeatureImportanceForestClassifier\nfrom sktree.tree import DecisionTreeClassifier\n\nseed = 12345\nrng = np.random.default_rng(seed)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Simulate data\nWe simulate the two feature sets, and the target variable. We then combine them\ninto a single dataset to perform hypothesis testing.\n\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"n_samples = 1000\nn_features_set = 500\nmean = 1.0\nsigma = 2.0\nbeta = 5.0\n\nunimportant_mean = 0.0\nunimportant_sigma = 4.5\n\n# first sample the informative features, and then the uniformative features\nX_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10))\nX_important = np.hstack(\n [\n X_important,\n rng.normal(\n loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10)\n ),\n ]\n)\n\nX_unimportant = rng.normal(\n loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set)\n)\n\n# simulate the binary target variable\ny = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Use partial AUC as test statistic\nYou can restrict the AUC to a maximum false positive rate (equivalently, a minimum\nspecificity of ``1 - max_fpr``) by modifying ``max_fpr`` in ``statistic``.\n\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"n_estimators = 125\nmax_features = \"sqrt\"\nmetric = \"auc\"\ntest_size = 0.2\nn_jobs = -1\nhonest_fraction = 0.7\nmax_fpr = 0.1\n\nest = FeatureImportanceForestClassifier(\n estimator=HonestForestClassifier(\n n_estimators=n_estimators,\n max_features=max_features,\n tree_estimator=DecisionTreeClassifier(),\n random_state=seed,\n honest_fraction=honest_fraction,\n n_jobs=n_jobs,\n ),\n random_state=seed,\n test_size=test_size,\n permute_per_tree=True,\n sample_dataset_per_tree=True,\n)\n\n# we test for the first feature set, which is important and thus should return a higher AUC\nstat, posterior_arr, samples = est.statistic(\n X_important,\n y,\n metric=metric,\n return_posteriors=True,\n)\n\nprint(f\"ASH-90 / Partial AUC: {stat}\")\nprint(f\"Shape of Observed Samples: {samples.shape}\")\nprint(f\"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Repeat for the second feature set\nThis feature set has a smaller statistic, which is expected due to its weak correlation.\n\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"stat, posterior_arr, samples = est.statistic(\n X_unimportant,\n y,\n metric=metric,\n return_posteriors=True,\n)\n\nprint(f\"ASH-90 / Partial AUC: {stat}\")\nprint(f\"Shape of Observed Samples: {samples.shape}\")\nprint(f\"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## All posteriors are saved within the model\nExtract the results from the model variables anytime. You can save the model with ``pickle``.\n\nASH-90 / Partial AUC: ``est.observe_stat_``\nObserved Samples: ``est.observe_samples_``\nTree Posteriors for the positive class: ``est.observe_posteriors_`` (n_trees, n_samples_test, 1)\nTrue Labels: ``est.y_true_final_``\n\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.15" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |
Binary file modified
BIN
+131 Bytes
(100%)
v0.2/_images/sphx_glr_plot_extended_isolation_forest_006.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified
BIN
-1.81 KB
(98%)
v0.2/_images/sphx_glr_plot_extended_isolation_forest_007.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified
BIN
-2.49 KB
(97%)
v0.2/_images/sphx_glr_plot_extended_isolation_forest_008.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified
BIN
+298 Bytes
(100%)
v0.2/_images/sphx_glr_plot_extended_isolation_forest_009.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified
BIN
+4.72 KB
(100%)
v0.2/_images/sphx_glr_plot_extended_isolation_forest_010.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified
BIN
-144 Bytes
(100%)
v0.2/_images/sphx_glr_plot_extra_oblique_random_forest_001.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified
BIN
-498 Bytes
(83%)
v0.2/_images/sphx_glr_plot_extra_oblique_random_forest_thumb.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified
BIN
-260 Bytes
(94%)
v0.2/_images/sphx_glr_plot_extra_orf_sample_size_thumb.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified
BIN
+40 Bytes
(100%)
v0.2/_images/sphx_glr_plot_oblique_random_forest_thumb.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.