Commit
Signed-off-by: Adam Li <adam2392@gmail.com>
Showing 137 changed files with 68,516 additions and 0 deletions.
@@ -8,6 +8,9 @@ coverage
*.xml
.venv

sktree/*
examples/*

# Sphinx documentation
docs/_build/
docs/generated/
Binary file added (BIN, +25.6 KB): stable/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
stable/_downloads/08d879f13e283922a169e8984e0c3b50/plot_oblique_random_forest.ipynb (54 additions, 0 deletions)
@@ -0,0 +1,54 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Plot oblique forest and axis-aligned random forest predictions on cc18 datasets\n\nA performance comparison between oblique forest and standard axis-\naligned random forest using three datasets from OpenML benchmarking suites.\n\nTwo of these datasets, namely\n[WDBC](https://www.openml.org/search?type=data&sort=runs&id=1510)\nand [Phishing Website](https://www.openml.org/search?type=data&sort=runs&id=4534)\ndatasets consist of 31 features where the former dataset is entirely numeric\nand the latter dataset is entirely norminal. The third dataset, dubbed\n[cnae-9](https://www.openml.org/search?type=data&status=active&id=1468), is a\nnumeric dataset that has notably large feature space of 857 features. As you\nwill notice, of these three datasets, the oblique forest outperforms axis-aligned\nrandom forest on cnae-9 utilizing sparse random projection mechanism. All datasets\nare subsampled due to computational constraints.\n" | ||
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from datetime import datetime\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport seaborn as sns\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import RepeatedKFold, cross_validate\n\nfrom sktree import ObliqueRandomForestClassifier\n\nrandom_state = 123456\nt0 = datetime.now()\ndata_ids = [4534, 1510, 1468] # openml dataset id\ndf = pd.DataFrame()\n\n\ndef load_cc18(data_id):\n df = fetch_openml(data_id=data_id, as_frame=True, parser=\"pandas\")\n\n # extract the dataset name\n d_name = df.details[\"name\"]\n\n # Subsampling large datasets\n if data_id == 1468:\n n = 100\n else:\n n = int(df.frame.shape[0] * 0.8)\n\n df = df.frame.sample(n, random_state=random_state)\n X, y = df.iloc[:, :-1], df.iloc[:, -1]\n\n return X, y, d_name\n\n\ndef get_scores(X, y, d_name, n_cv=5, n_repeats=1, **kwargs):\n clfs = [RandomForestClassifier(**kwargs), ObliqueRandomForestClassifier(**kwargs)]\n\n tmp = []\n\n for i, clf in enumerate(clfs):\n cv = RepeatedKFold(n_splits=n_cv, n_repeats=n_repeats, random_state=kwargs[\"random_state\"])\n test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring=\"accuracy\")\n\n tmp.append(\n [\n d_name,\n [\"RF\", \"OF\"][i],\n test_score[\"test_score\"],\n test_score[\"test_score\"].mean(),\n ]\n )\n\n df = pd.DataFrame(\n tmp, columns=[\"dataset\", \"model\", \"score\", \"mean\"]\n ) # dtype=[('model',object), ('score',float), ('mean',float)])\n df = df.explode(\"score\")\n df[\"score\"] = df[\"score\"].astype(float)\n df.reset_index(inplace=True, drop=True)\n\n return df\n\n\nparams = {\n \"max_features\": None,\n \"n_estimators\": 50,\n \"max_depth\": None,\n \"random_state\": random_state,\n \"n_cv\": 2,\n \"n_repeats\": 1,\n}\n\nfor data_id in data_ids:\n X, y, d_name = load_cc18(data_id=data_id)\n print(f\"Loading [{d_name}] dataset..\")\n tmp = get_scores(X=X, y=y, d_name=d_name, **params)\n df = pd.concat([df, tmp])\n\nprint(f\"It took {(datetime.now()-t0).seconds} seconds to run the script\")\n\n# Draw a comparison plot\nd_names = df.dataset.unique()\nN = d_names.shape[0]\n\nfig, ax = plt.subplots(1, N)\nfig.set_size_inches(6 * N, 6)\n\nfor i, name in enumerate(d_names):\n sns.stripplot(\n data=df.query(f'dataset == \"{name}\"'),\n x=\"model\",\n y=\"score\",\n ax=ax[i],\n dodge=True,\n )\n sns.boxplot(\n data=df.query(f'dataset == \"{name}\"'),\n x=\"model\",\n y=\"score\",\n ax=ax[i],\n color=\"white\",\n )\n ax[i].set_title(name)\n if i != 0:\n ax[i].set_ylabel(\"\")\n ax[i].set_xlabel(\"\")" | ||
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
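
For quick reference, here is a minimal offline sketch of the comparison the notebook above performs. It is hedged: it only assumes that sktree.ObliqueRandomForestClassifier follows the scikit-learn estimator API (as the notebook's imports suggest) and swaps the OpenML downloads for a synthetic dataset, so it is illustrative rather than a reproduction of the cc18 benchmark.

# Hedged sketch: oblique vs. axis-aligned random forest on synthetic data.
# ObliqueRandomForestClassifier is assumed scikit-learn compatible, as above.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sktree import ObliqueRandomForestClassifier

X, y = make_classification(n_samples=200, n_features=30, random_state=0)
for name, clf in [
    ("RF", RandomForestClassifier(n_estimators=50, random_state=0)),
    ("OF", ObliqueRandomForestClassifier(n_estimators=50, random_state=0)),
]:
    scores = cross_val_score(clf, X, y, cv=2, scoring="accuracy")
    print(f"{name}: mean accuracy = {scores.mean():.3f}")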
stable/_downloads/0a65e88a0e06f6c9a3b9c7f9b8a19554/plot_projection_matrices.py (304 additions, 0 deletions)
@@ -0,0 +1,304 @@
"""
===============================================
Plot the projection matrices of an oblique tree
===============================================

This example shows how projection matrices are generated for an oblique tree,
specifically the :class:`sktree.tree.PatchObliqueDecisionTreeClassifier`.

For a tree, one can specify the structure of the data that it will be trained
on (i.e. ``(X, y)``) via the ``data_dims`` parameter. For example, if the data
is 2D, then ``data_dims`` should be set to ``(n_rows, n_cols)``, where each row
of ``X`` is a 1D array of length ``n_rows * n_cols``. If the data is 3D, then
``data_dims`` should be set to ``(n_rows, n_cols, n_depth)``, where each row of
``X`` is a 1D array of length ``n_rows * n_cols * n_depth``. This allows the
tree to be trained on data with any structured dimensionality while remaining
compatible with the sklearn API.

The projection matrices are used to generate patches of the data, and these
patches are used to compute the feature values considered during splitting. A
patch is generated by sampling a hyperrectangle from the data. The
hyperrectangle is defined by a starting point and a patch size. The starting
point is sampled uniformly from the structure of the data: for example, if each
row of ``X`` has a 2D image structure ``(n_rows, n_cols)``, then the starting
point is sampled uniformly from that grid. The patch size is sampled uniformly
from the range ``min_patch_dims`` to ``max_patch_dims`` and is constrained to
lie within the bounds of the data structure: if the patch size is ``(3, 3)``
and the data structure is ``(5, 5)``, then the patch only samples indices
within the data. Each dimension may also be made arbitrarily discontiguous.

For details on how to use the hyperparameters related to the patches, see
:class:`sktree.tree.PatchObliqueDecisionTreeClassifier`.
"""

import matplotlib.pyplot as plt

# import modules
# .. note:: We use a private Cython module here to demonstrate what the patches
#    look like. This is not part of the public API. The module used here is
#    just a Python wrapper for the underlying Cython code and is not the same
#    as the Cython splitter used in the actual implementation. To use the
#    actual splitter, use the public API of the relevant tree/forest class.
import numpy as np

from sktree._lib.sklearn.tree._criterion import Gini
from sktree.tree.manifold._morf_splitter import BestPatchSplitterTester

# %%
# Initialize patch splitter
# -------------------------
# The patch splitter is used to generate patches for the projection matrices.
# We will initialize the patch splitter with some dummy values for the sake of
# this example.

criterion = Gini(1, np.array((0, 1)))
max_features = 6
min_samples_leaf = 1
min_weight_leaf = 0.0
random_state = np.random.RandomState(100)

boundary = None
feature_weight = None

# initialize some dummy data
X = np.repeat(np.arange(25).astype(np.float32), 5).reshape(5, -1)
y = np.array([0, 0, 0, 1, 1]).reshape(-1, 1).astype(np.float64)
sample_weight = np.ones(5)

print("The shape of our dataset is: ", X.shape, y.shape, sample_weight.shape)

# %%
# Generate 1D patches
# -------------------
# Now that we have the patch splitter initialized, we can generate some patches
# and visualize how they appear on the data. We will make the patch 1D, which
# samples multiple rows contiguously: a 1D patch of up to 3 rows and exactly
# 1 column.
min_patch_dims = np.array((1, 1))
max_patch_dims = np.array((3, 1))
dim_contiguous = np.array((True, True))
data_dims = np.array((5, 5))

splitter = BestPatchSplitterTester(
    criterion,
    max_features,
    min_samples_leaf,
    min_weight_leaf,
    random_state,
    min_patch_dims,
    max_patch_dims,
    dim_contiguous,
    data_dims,
    boundary,
    feature_weight,
)
splitter.init_test(X, y, sample_weight, None)

# sample the projection matrix that consists of 1D patches
proj_mat = splitter.sample_projection_matrix()
print(proj_mat.shape)

# Visualize 1D patches
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True)
axs = axs.flatten()
for idx, ax in enumerate(axs):
    ax.imshow(proj_mat[idx, :].reshape(data_dims), cmap="viridis")
    ax.set(
        xlim=(-1, data_dims[1]),
        ylim=(-1, data_dims[0]),
        title=f"Patch {idx}",
    )

fig.suptitle("1D Patch Visualization")
plt.show()

# %%
# Generate 2D patches
# -------------------
# We will make the patch 2D, which samples a contiguous block of rows and
# columns: a 2D patch of up to 3 rows and up to 3 columns.

min_patch_dims = np.array((1, 1))
max_patch_dims = np.array((3, 3))
dim_contiguous = np.array((True, True))
data_dims = np.array((5, 5))

splitter = BestPatchSplitterTester(
    criterion,
    max_features,
    min_samples_leaf,
    min_weight_leaf,
    random_state,
    min_patch_dims,
    max_patch_dims,
    dim_contiguous,
    data_dims,
    boundary,
    feature_weight,
)
splitter.init_test(X, y, sample_weight)

# sample the projection matrix that consists of 2D patches
proj_mat = splitter.sample_projection_matrix()

# Visualize 2D patches
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True)
axs = axs.flatten()
for idx, ax in enumerate(axs):
    ax.imshow(proj_mat[idx, :].reshape(data_dims), cmap="viridis")
    ax.set(
        xlim=(-1, data_dims[1]),
        ylim=(-1, data_dims[0]),
        title=f"Patch {idx}",
    )

fig.suptitle("2D Patch Visualization")
plt.show()

# %%
# Generate 3D patches
# -------------------

# initialize some dummy data with a (5, 5, 5) structure per sample
X = np.repeat(np.arange(25 * 5).astype(np.float32), 5).reshape(5, -1)
y = np.array([0, 0, 0, 1, 1]).reshape(-1, 1).astype(np.float64)
sample_weight = np.ones(5)

# We will make the patch 3D, contiguous in every dimension: up to 3 in the
# first dimension, exactly 2 in the second, and up to 4 in the third.
min_patch_dims = np.array((1, 2, 1))
max_patch_dims = np.array((3, 2, 4))
dim_contiguous = np.array((True, True, True))
data_dims = np.array((5, 5, 5))

splitter = BestPatchSplitterTester(
    criterion,
    max_features,
    min_samples_leaf,
    min_weight_leaf,
    random_state,
    min_patch_dims,
    max_patch_dims,
    dim_contiguous,
    data_dims,
    boundary,
    feature_weight,
)
splitter.init_test(X, y, sample_weight)

# sample the projection matrix that consists of 3D patches
proj_mat = splitter.sample_projection_matrix()
print(proj_mat.shape)

fig = plt.figure()
for idx in range(3 * 2):
    ax = fig.add_subplot(2, 3, idx + 1, projection="3d")

    # Plot the sampled patch indices as a 3D scatter.
    z, x, y = proj_mat[idx, :].reshape(data_dims).nonzero()
    ax.scatter(x, y, z, alpha=1, marker="o", color="black")

    # Customize the z axis.
    ax.set_zlim(-1.01, data_dims[2])
    ax.set(
        xlim=(-1, data_dims[1]),
        ylim=(-1, data_dims[0]),
        title=f"Patch {idx}",
    )

fig.suptitle("3D Patch Visualization")
plt.show()

# %%
# Discontiguous Patches
# ---------------------
# We can also generate patches that are not contiguous. This is useful for
# analyzing data that is structured, but not necessarily contiguous in certain
# dimensions. For example, we can generate patches that sample the data in a
# multivariate time series, where the data consists of ``(n_channels, n_times)``
# and the patches are discontiguous in the channel dimension, but contiguous
# in the time dimension. Here, we show an example patch.
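
# A conceptual NumPy sketch (an illustration, not the library's sampling code)
# of a channel-discontiguous, time-contiguous patch mask for such a
# ``(n_channels, n_times)`` series:
rng_sketch = np.random.default_rng(0)
n_channels, n_times = 5, 10
mask = np.zeros((n_channels, n_times), dtype=bool)
chosen_channels = rng_sketch.choice(n_channels, size=2, replace=False)  # possibly non-adjacent
t_start = rng_sketch.integers(0, n_times - 3)  # contiguous window of 3 time points
mask[chosen_channels, t_start : t_start + 3] = True
print(mask.astype(int))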

# initialize some dummy data
X = np.repeat(np.arange(25).astype(np.float32), 5).reshape(5, -1)
y = np.array([0, 0, 0, 1, 1]).reshape(-1, 1).astype(np.float64)
sample_weight = np.ones(5)
max_features = 9

# We will make the patch 2D, discontiguous in the rows and contiguous in the
# columns: the patch spans 2 to 3 rows and 2 to 4 columns.
min_patch_dims = np.array((2, 2))
max_patch_dims = np.array((3, 4))
dim_contiguous = np.array((False, True))
data_dims = np.array((5, 5))

splitter = BestPatchSplitterTester(
    criterion,
    max_features,
    min_samples_leaf,
    min_weight_leaf,
    random_state,
    min_patch_dims,
    max_patch_dims,
    dim_contiguous,
    data_dims,
    boundary,
    feature_weight,
)
splitter.init_test(X, y, sample_weight)

# sample the projection matrix that consists of discontiguous 2D patches
proj_mat = splitter.sample_projection_matrix()

# Visualize 2D patches
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True)
axs = axs.flatten()
for idx, ax in enumerate(axs):
    ax.imshow(proj_mat[idx, :].reshape(data_dims), cmap="viridis")
    ax.set(
        xlim=(-1, data_dims[1]),
        ylim=(-1, data_dims[0]),
        title=f"Patch {idx}",
    )

fig.suptitle("2D Discontiguous Patch Visualization")
plt.show()

# %%
# Finally, we make both dimensions discontiguous, so the patch may sample
# non-adjacent rows and non-adjacent columns.
dim_contiguous = np.array((False, False))

splitter = BestPatchSplitterTester(
    criterion,
    max_features,
    min_samples_leaf,
    min_weight_leaf,
    random_state,
    min_patch_dims,
    max_patch_dims,
    dim_contiguous,
    data_dims,
    boundary,
    feature_weight,
)
splitter.init_test(X, y, sample_weight)

# sample the projection matrix that consists of fully discontiguous 2D patches
proj_mat = splitter.sample_projection_matrix()

# Visualize 2D patches
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True)
axs = axs.flatten()
for idx, ax in enumerate(axs):
    ax.imshow(proj_mat[idx, :].reshape(data_dims), cmap="viridis")
    ax.set(
        xlim=(-1, data_dims[1]),
        ylim=(-1, data_dims[0]),
        title=f"Patch {idx}",
    )

fig.suptitle("2D Discontiguous In All Dims Patch Visualization")
plt.show()