Commit

Add v0.2 and new dev docs
Signed-off-by: Adam Li <adam2392@gmail.com>
adam2392 committed Oct 6, 2023
1 parent bce9d9d commit 40ef42a
Showing 321 changed files with 86,484 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -7,6 +7,8 @@ build-install/
coverage
*.xml
.venv
.asv/
doc/

sktree/*
examples/*
57 files renamed without changes.
4 changes: 4 additions & 0 deletions v0.2/.buildinfo
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 08d17cba0db21befacb226b9f7a187ed
tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -0,0 +1,138 @@
"""
===========================================================
Mutual Information for Gigantic Hypothesis Testing (MIGHT)
===========================================================
An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for a
nonparametric multivariate hypothesis test on simulated datasets. Here, we present a
simulation of how MIGHT is used to test the hypothesis that a "feature set is important
for predicting the target". This is a generalization of the framework presented in
:footcite:`coleman2022scalable`.

We simulate a dataset with 1000 samples, 1000 features, and a binary class target
variable. The features are split into two feature sets of 500 features each. One could
think of these, for example, as different datasets collected on the same patients in a
biomedical setting. The first feature set (X) is strongly correlated with the target (y),
and the second feature set (W) is uncorrelated with the target. Here, we are testing the
null hypothesis:

- ``H0: I(X; y) - I(X, W; y) = 0``
- ``HA: I(X; y) - I(X, W; y) < 0``, indicating that adding ``W`` provides additional
  mutual information about ``y``

where ``I`` denotes mutual information. The null hypothesis holds, for example, in the
following settings, where X is our informative feature set and W is our uninformative
feature set:

- ``W X -> y``: here ``W`` is completely disconnected from X and y.
- ``W -> X -> y``: here ``W`` is d-separated from y given X.
- ``W <- X -> y``: here ``W`` is d-separated from y given X.

We then use MIGHT, via :class:`~sktree.stats.FeatureImportanceForestClassifier`, to test
the hypothesis that the first feature set is important for predicting the target, while
the second feature set is not.
"""

import numpy as np
from scipy.special import expit

from sktree import HonestForestClassifier
from sktree.stats import FeatureImportanceForestClassifier
from sktree.tree import DecisionTreeClassifier

seed = 12345
rng = np.random.default_rng(seed)
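
# %%
# As a purely illustrative aside (not part of the MIGHT procedure itself), the null
# hypothesis above can be checked on a tiny discrete toy problem: when ``W`` is
# independent noise, the estimates of ``I(X; y)`` and ``I(X, W; y)`` coincide up to
# estimation error. The variables below are hypothetical and use their own generator
# so that the simulation further down is unaffected.
from sklearn.metrics import mutual_info_score

toy_rng = np.random.default_rng(0)
x_toy = toy_rng.integers(0, 2, size=100_000)  # informative binary feature
w_toy = toy_rng.integers(0, 2, size=100_000)  # independent noise feature
y_toy = np.where(toy_rng.random(100_000) < 0.9, x_toy, 1 - x_toy)  # y mostly copies x

print(f"I(X; y) estimate: {mutual_info_score(x_toy, y_toy):.4f}")
# encode the joint (X, W) as a single discrete label to estimate I(X, W; y)
print(f"I(X, W; y) estimate: {mutual_info_score(x_toy * 2 + w_toy, y_toy):.4f}")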

# %%
# Simulate data
# -------------
# We simulate the two feature sets, and the target variable. We then combine them
# into a single dataset to perform hypothesis testing.

n_samples = 1000
n_features_set = 500
mean = 1.0
sigma = 2.0
beta = 5.0

unimportant_mean = 0.0
unimportant_sigma = 4.5

# Sample the "important" feature set: only its first 10 features carry signal, while the
# rest are noise. Then sample the fully uninformative feature set.
X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10))
X_important = np.hstack(
[
X_important,
rng.normal(
loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10)
),
]
)

X_unimportant = rng.normal(
loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set)
)
X = np.hstack([X_important, X_unimportant])

# simulate the binary target variable
y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples)

# %%
# Perform hypothesis testing using Mutual Information
# ---------------------------------------------------
# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the
# hypothesis test. The test statistic is computed by comparing the metric (i.e. mutual
# information) estimated from two forests: one forest is trained on the original dataset,
# and one forest is trained on a permuted dataset, where the rows of the
# ``covariate_index`` columns are shuffled randomly.
#
# The null distribution is then estimated in an efficient manner using the framework of
# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the
# posteriors) are sampled randomly ``n_repeats`` times to generate a null distribution.
# The pvalue is then computed as the proportion of samples in the null distribution that
# are at least as large as the observed test statistic.
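
# %%
# As a rough, hypothetical sketch (not the library's internal implementation), a
# permutation-test pvalue can be computed from an observed statistic and an array of
# null statistics as the proportion of null draws at least as extreme as the observation.
# The numbers below are made up purely to show the arithmetic.
toy_observed_stat = 0.3
toy_null_stats = np.random.default_rng(0).normal(loc=0.0, scale=0.1, size=1000)
toy_pvalue = (1 + (toy_null_stats >= toy_observed_stat).sum()) / (1 + toy_null_stats.size)
print(f"Toy permutation pvalue: {toy_pvalue:.4f}")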

n_estimators = 200
max_features = "sqrt"
test_size = 0.2
n_repeats = 1000
n_jobs = -1

est = FeatureImportanceForestClassifier(
estimator=HonestForestClassifier(
n_estimators=n_estimators,
max_features=max_features,
tree_estimator=DecisionTreeClassifier(),
random_state=seed,
honest_fraction=0.7,
n_jobs=n_jobs,
),
random_state=seed,
test_size=test_size,
permute_per_tree=True,
sample_dataset_per_tree=False,
)

print(
f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: "
f"{est.sample_dataset_per_tree}"
)
# we test for the first feature set, which is important and thus should return a pvalue < 0.05
stat, pvalue = est.test(
X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats
)
print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}")

# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05
stat, pvalue = est.test(
X,
y,
covariate_index=np.arange(n_features_set, dtype=int) + n_features_set,
metric="mi",
n_repeats=n_repeats,
)
print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}")

# %%
# References
# ----------
# .. footbibliography::
Binary file not shown.
@@ -0,0 +1,43 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Plot oblique forest and axis-aligned random forest predictions on cc18 datasets\n\nA performance comparison between oblique forest and standard axis-\naligned random forest using three datasets from OpenML benchmarking suites.\n\nTwo of these datasets, namely\n[WDBC](https://www.openml.org/search?type=data&sort=runs&id=1510)\nand [Phishing Website](https://www.openml.org/search?type=data&sort=runs&id=4534)\ndatasets consist of 31 features where the former dataset is entirely numeric\nand the latter dataset is entirely norminal. The third dataset, dubbed\n[cnae-9](https://www.openml.org/search?type=data&status=active&id=1468), is a\nnumeric dataset that has notably large feature space of 857 features. As you\nwill notice, of these three datasets, the oblique forest outperforms axis-aligned\nrandom forest on cnae-9 utilizing sparse random projection mechanism. All datasets\nare subsampled due to computational constraints.\n"
]
},
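{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a hedged, illustrative aside (not sktree's internal splitter code), the next cell sketches the sparse random projection idea behind an oblique split: a few randomly chosen features are combined with random weights, and the candidate split thresholds the projected value rather than a single raw feature. All variable names here are hypothetical."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative sketch only: a sparse random projection as one oblique split candidate.\nimport numpy as np\n\nproj_rng = np.random.default_rng(0)\nX_toy = proj_rng.normal(size=(6, 857))  # 6 toy samples with 857 features, as in cnae-9\n\n# build a sparse projection vector: 3 nonzero entries with random signs\nweights = np.zeros(857)\nchosen = proj_rng.choice(857, size=3, replace=False)\nweights[chosen] = proj_rng.choice([-1.0, 1.0], size=3)\n\n# the oblique candidate feature is the projection of each sample onto this vector\nprojected = X_toy @ weights\n\n# a possible split sends samples left/right by thresholding the projected value\nthreshold = np.median(projected)\nprint(projected <= threshold)"
]
},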
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from datetime import datetime\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport seaborn as sns\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import RepeatedKFold, cross_validate\n\nfrom sktree import ObliqueRandomForestClassifier\n\nrandom_state = 123456\nt0 = datetime.now()\ndata_ids = [4534, 1510, 1468] # openml dataset id\ndf = pd.DataFrame()\n\n\ndef load_cc18(data_id):\n df = fetch_openml(data_id=data_id, as_frame=True, parser=\"pandas\")\n\n # extract the dataset name\n d_name = df.details[\"name\"]\n\n # Subsampling large datasets\n if data_id == 1468:\n n = 100\n else:\n n = int(df.frame.shape[0] * 0.8)\n\n df = df.frame.sample(n, random_state=random_state)\n X, y = df.iloc[:, :-1], df.iloc[:, -1]\n\n return X, y, d_name\n\n\ndef get_scores(X, y, d_name, n_cv=5, n_repeats=1, **kwargs):\n clfs = [RandomForestClassifier(**kwargs), ObliqueRandomForestClassifier(**kwargs)]\n\n tmp = []\n\n for i, clf in enumerate(clfs):\n cv = RepeatedKFold(n_splits=n_cv, n_repeats=n_repeats, random_state=kwargs[\"random_state\"])\n test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring=\"accuracy\")\n\n tmp.append(\n [\n d_name,\n [\"RF\", \"OF\"][i],\n test_score[\"test_score\"],\n test_score[\"test_score\"].mean(),\n ]\n )\n\n df = pd.DataFrame(\n tmp, columns=[\"dataset\", \"model\", \"score\", \"mean\"]\n ) # dtype=[('model',object), ('score',float), ('mean',float)])\n df = df.explode(\"score\")\n df[\"score\"] = df[\"score\"].astype(float)\n df.reset_index(inplace=True, drop=True)\n\n return df\n\n\nparams = {\n \"max_features\": None,\n \"n_estimators\": 50,\n \"max_depth\": None,\n \"random_state\": random_state,\n \"n_cv\": 2,\n \"n_repeats\": 1,\n}\n\nfor data_id in data_ids:\n X, y, d_name = load_cc18(data_id=data_id)\n print(f\"Loading [{d_name}] dataset..\")\n tmp = get_scores(X=X, y=y, d_name=d_name, **params)\n df = pd.concat([df, tmp])\n\nprint(f\"It took {(datetime.now()-t0).seconds} seconds to run the script\")\n\n# Draw a comparison plot\nd_names = df.dataset.unique()\nN = d_names.shape[0]\n\nfig, ax = plt.subplots(1, N)\nfig.set_size_inches(6 * N, 6)\n\nfor i, name in enumerate(d_names):\n sns.stripplot(\n data=df.query(f'dataset == \"{name}\"'),\n x=\"model\",\n y=\"score\",\n ax=ax[i],\n dodge=True,\n )\n sns.boxplot(\n data=df.query(f'dataset == \"{name}\"'),\n x=\"model\",\n y=\"score\",\n ax=ax[i],\n color=\"white\",\n )\n ax[i].set_title(name)\n if i != 0:\n ax[i].set_ylabel(\"\")\n ax[i].set_xlabel(\"\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@@ -0,0 +1,43 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Compare extra oblique forest and oblique random forest predictions on cc18 datasets\n\nA performance comparison between extra oblique forest and standard oblique random\nforest using four datasets from OpenML benchmarking suites.\n\nExtra oblique forest uses extra oblique trees as base model which differ from classic\ndecision trees in the way they are built. When looking for the best split to\nseparate the samples of a node into two groups, random splits are drawn for each\nof the `max_features` randomly selected features and the best split among those is\nchosen. This is in contrast with the greedy approach, which evaluates the best possible\nthreshold for each chosen split. For details of the original extra-tree, see [1]_.\n\nThe datasets used in this example are from the OpenML benchmarking suite are:\n\n* [Phishing Website](https://www.openml.org/search?type=data&sort=runs&id=4534)\n* [WDBC](https://www.openml.org/search?type=data&sort=runs&id=1510)\n* [Lsvt](https://www.openml.org/search?type=data&sort=runs&id=1484)\n* [har](https://www.openml.org/search?type=data&sort=runs&id=1478)\n* [cnae-9](https://www.openml.org/search?type=data&sort=runs&id==1468)\n\nLarge datasets are subsampled due to computational constraints for running\nthis example. Note that `cnae-9` is\nan high dimensional dataset with very sparse 856 features, mostly consisting of zeros.\n\n+------------------+-------------+--------------+----------+\n| Dataset | # Samples | # Features | Datatype |\n+==================+=============+==============+==========+\n| Phishing Website | 2000 | 30 | nominal |\n+------------------+-------------+--------------+----------+\n| WDBC | 455 | 30 | numeric |\n+------------------+-------------+--------------+----------+\n| Lsvt | 100 | 310 | numeric |\n+------------------+-------------+--------------+----------+\n| har | 2000 | 561 | numeric |\n+------------------+-------------+--------------+----------+\n| cnae-9 | 864 | 856 | numeric |\n+------------------+-------------+--------------+----------+\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>In the following example, the parameters `max_depth` and 'max_features` are\n set deliberately low in order to allow the example to run in our CI test suite.\n For normal usage, these parameters should be set to appropriate values depending\n on the dataset.</p></div>\n\n## Discussion\nExtra Oblique Tree demonstrates performance similar to that of regular Oblique Tree on average\nwith some increase in variance. See [1]_ for a detailed discussion on the bias-variance tradeoff\nof extra-trees vs normal trees.\n\nHowever, Extra Oblique Tree runs substantially faster than Oblique Tree on some datasets due to\nthe random split process which omits the computationally expensive search for the best split.\nThe main source of increase in speed stems from the omission of sorting samples during the\nsplitting of a node. In the standard trees, samples are sorted in ascending order to determine the\nbest split hence the complexity is `O(n\\log(n))`. In Extra trees, samples\nare not sorted and the split is determined by randomly drawing a threshold from the feature's\nrange, hence the complexity is `O(n)`. This makes the algorithm more suitable for large datasets.\n\n## References\n.. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\", Machine Learning, 63(1),\n 3-42, 2006.\n"
]
},
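{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is a minimal, hypothetical sketch (not the sktree splitter) that contrasts the greedy best-threshold search with the extra-trees style random threshold draw on a single toy feature, making the `O(n log n)` vs `O(n)` discussion above concrete. All names are made up for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative sketch only: greedy vs random threshold selection on one feature.\nimport numpy as np\n\nsplit_rng = np.random.default_rng(0)\nfeature = split_rng.normal(size=200)\nlabels = (feature + split_rng.normal(scale=0.5, size=200) > 0).astype(int)\n\n\ndef gini(y):\n    # Gini impurity of a binary label vector\n    p = y.mean()\n    return 2 * p * (1 - p)\n\n\ndef impurity_decrease(threshold):\n    # impurity decrease obtained by splitting `feature` at `threshold`\n    left, right = labels[feature <= threshold], labels[feature > threshold]\n    if len(left) == 0 or len(right) == 0:\n        return 0.0\n    weighted = (len(left) * gini(left) + len(right) * gini(right)) / len(labels)\n    return gini(labels) - weighted\n\n\n# greedy search: candidate thresholds come from the sorted feature values (needs an O(n log n) sort)\nbest_threshold = max(np.sort(feature), key=impurity_decrease)\n\n# extra-trees style: a single threshold drawn uniformly from the feature's range (O(n))\nrandom_threshold = split_rng.uniform(feature.min(), feature.max())\n\nprint(f\"greedy split gain: {impurity_decrease(best_threshold):.4f}\")\nprint(f\"random split gain: {impurity_decrease(random_threshold):.4f}\")"
]
},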
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from datetime import datetime\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport seaborn as sns\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.model_selection import RepeatedKFold, cross_validate\n\nfrom sktree import ExtraObliqueRandomForestClassifier, ObliqueRandomForestClassifier\n\n# Model parameters\nmax_depth = 3\nmax_features = \"sqrt\"\nmax_sample_size = 2000\nrandom_state = 123\nn_estimators = 50\n\n# Datasets\nphishing_website = 4534\nwdbc = 1510\nlsvt = 1484\nhar = 1478\ncnae_9 = 1468\n\ndata_ids = [phishing_website, wdbc, lsvt, har, cnae_9]\ndf = pd.DataFrame()\n\n\ndef load_cc18(data_id):\n df = fetch_openml(data_id=data_id, as_frame=True, parser=\"pandas\")\n\n # extract the dataset name\n d_name = df.details[\"name\"]\n\n # Subsampling large datasets\n n = int(df.frame.shape[0] * 0.8)\n\n if n > max_sample_size:\n n = max_sample_size\n\n df = df.frame.sample(n, random_state=random_state)\n X, y = df.iloc[:, :-1], df.iloc[:, -1]\n\n return X, y, d_name\n\n\ndef get_scores(X, y, d_name, n_cv=5, n_repeats=1, **kwargs):\n clfs = [ExtraObliqueRandomForestClassifier(**kwargs), ObliqueRandomForestClassifier(**kwargs)]\n dim = X.shape\n tmp = []\n\n for i, clf in enumerate(clfs):\n t0 = datetime.now()\n cv = RepeatedKFold(n_splits=n_cv, n_repeats=n_repeats, random_state=kwargs[\"random_state\"])\n test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring=\"accuracy\")\n time_taken = datetime.now() - t0\n # convert the time taken to seconds\n time_taken = time_taken.total_seconds()\n\n tmp.append(\n [\n d_name,\n dim,\n [\"EORF\", \"ORF\"][i],\n test_score[\"test_score\"],\n test_score[\"test_score\"].mean(),\n time_taken,\n ]\n )\n\n df = pd.DataFrame(tmp, columns=[\"dataset\", \"dimension\", \"model\", \"score\", \"mean\", \"time_taken\"])\n df = df.explode(\"score\")\n df[\"score\"] = df[\"score\"].astype(float)\n df.reset_index(inplace=True, drop=True)\n\n return df\n\n\nparams = {\n \"max_features\": max_features,\n \"n_estimators\": n_estimators,\n \"max_depth\": max_depth,\n \"random_state\": random_state,\n \"n_cv\": 10,\n \"n_repeats\": 1,\n}\n\nfor data_id in data_ids:\n X, y, d_name = load_cc18(data_id=data_id)\n tmp = get_scores(X=X, y=y, d_name=d_name, **params)\n df = pd.concat([df, tmp])\n\n# Show the time taken to train each model\nprint(pd.DataFrame.from_dict(params, orient=\"index\", columns=[\"value\"]))\nprint(df.groupby([\"dataset\", \"dimension\", \"model\"])[[\"time_taken\"]].mean())\n\n# Draw a comparison plot\nd_names = df.dataset.unique()\nN = d_names.shape[0]\n\nfig, ax = plt.subplots(1, N)\nfig.set_size_inches(6 * N, 6)\n\nfor i, name in enumerate(d_names):\n sns.stripplot(\n data=df.query(f'dataset == \"{name}\"'),\n x=\"model\",\n y=\"score\",\n ax=ax[i],\n dodge=True,\n )\n sns.boxplot(\n data=df.query(f'dataset == \"{name}\"'),\n x=\"model\",\n y=\"score\",\n ax=ax[i],\n color=\"white\",\n )\n ax[i].set_title(name)\n if i != 0:\n ax[i].set_ylabel(\"\")\n ax[i].set_xlabel(\"\")\n# show the figure\nplt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
}
},
"nbformat": 4,
"nbformat_minor": 0
}