Adding stable docs

Signed-off-by: Adam Li <adam2392@gmail.com>
neurodata · Jul 6, 2023 · 0231e16 · 0231e16
1 parent 4aa52e1
commit 0231e16
Show file tree

Hide file tree

Showing 137 changed files with 68,516 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,9 @@ coverage
 *.xml
 .venv
 
+sktree/*
+examples/*
+
 # Sphinx documentation
 docs/_build/
 docs/generated/

diff --git a/stable/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip b/stable/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
diff --git a/stable/_downloads/08d879f13e283922a169e8984e0c3b50/plot_oblique_random_forest.ipynb b/stable/_downloads/08d879f13e283922a169e8984e0c3b50/plot_oblique_random_forest.ipynb
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Plot oblique forest and axis-aligned random forest predictions on cc18 datasets\n\nA performance comparison between oblique forest and standard axis-aligned\nrandom forest using three datasets from OpenML benchmarking suites.\n\nTwo of these datasets, namely the\n[WDBC](https://www.openml.org/search?type=data&sort=runs&id=1510)\nand [Phishing Website](https://www.openml.org/search?type=data&sort=runs&id=4534)\ndatasets, consist of 31 features, where the former dataset is entirely numeric\nand the latter dataset is entirely nominal. The third dataset, dubbed\n[cnae-9](https://www.openml.org/search?type=data&status=active&id=1468), is a\nnumeric dataset that has a notably large feature space of 857 features. As you\nwill notice, of these three datasets, the oblique forest outperforms axis-aligned\nrandom forest on cnae-9 by utilizing a sparse random projection mechanism. All datasets\nare subsampled due to computational constraints.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from datetime import datetime\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport seaborn as sns\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import RepeatedKFold, cross_validate\n\nfrom sktree import ObliqueRandomForestClassifier\n\nrandom_state = 123456\nt0 = datetime.now()\ndata_ids = [4534, 1510, 1468]  # openml dataset id\ndf = pd.DataFrame()\n\n\ndef load_cc18(data_id):\n    df = fetch_openml(data_id=data_id, as_frame=True, parser=\"pandas\")\n\n    # extract the dataset name\n    d_name = df.details[\"name\"]\n\n    # Subsampling large datasets\n    if data_id == 1468:\n        n = 100\n    else:\n        n = int(df.frame.shape[0] * 0.8)\n\n    df = df.frame.sample(n, random_state=random_state)\n    X, y = df.iloc[:, :-1], df.iloc[:, -1]\n\n    return X, y, d_name\n\n\ndef get_scores(X, y, d_name, n_cv=5, n_repeats=1, **kwargs):\n    clfs = [RandomForestClassifier(**kwargs), ObliqueRandomForestClassifier(**kwargs)]\n\n    tmp = []\n\n    for i, clf in enumerate(clfs):\n        cv = RepeatedKFold(n_splits=n_cv, n_repeats=n_repeats, random_state=kwargs[\"random_state\"])\n        test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring=\"accuracy\")\n\n        tmp.append(\n            [\n                d_name,\n                [\"RF\", \"OF\"][i],\n                test_score[\"test_score\"],\n                test_score[\"test_score\"].mean(),\n            ]\n        )\n\n    df = pd.DataFrame(\n        tmp, columns=[\"dataset\", \"model\", \"score\", \"mean\"]\n    )  # dtype=[('model',object), ('score',float), ('mean',float)])\n    df = df.explode(\"score\")\n    df[\"score\"] = df[\"score\"].astype(float)\n    df.reset_index(inplace=True, drop=True)\n\n    return df\n\n\nparams = {\n    \"max_features\": None,\n    \"n_estimators\": 50,\n    \"max_depth\": None,\n    \"random_state\": random_state,\n    \"n_cv\": 2,\n    \"n_repeats\": 
1,\n}\n\nfor data_id in data_ids:\n    X, y, d_name = load_cc18(data_id=data_id)\n    print(f\"Loading [{d_name}] dataset..\")\n    tmp = get_scores(X=X, y=y, d_name=d_name, **params)\n    df = pd.concat([df, tmp])\n\nprint(f\"It took {(datetime.now()-t0).seconds} seconds to run the script\")\n\n# Draw a comparison plot\nd_names = df.dataset.unique()\nN = d_names.shape[0]\n\nfig, ax = plt.subplots(1, N)\nfig.set_size_inches(6 * N, 6)\n\nfor i, name in enumerate(d_names):\n    sns.stripplot(\n        data=df.query(f'dataset == \"{name}\"'),\n        x=\"model\",\n        y=\"score\",\n        ax=ax[i],\n        dodge=True,\n    )\n    sns.boxplot(\n        data=df.query(f'dataset == \"{name}\"'),\n        x=\"model\",\n        y=\"score\",\n        ax=ax[i],\n        color=\"white\",\n    )\n    ax[i].set_title(name)\n    if i != 0:\n        ax[i].set_ylabel(\"\")\n    ax[i].set_xlabel(\"\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.9.15"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/stable/_downloads/0a65e88a0e06f6c9a3b9c7f9b8a19554/plot_projection_matrices.py b/stable/_downloads/0a65e88a0e06f6c9a3b9c7f9b8a19554/plot_projection_matrices.py
@@ -0,0 +1,304 @@
+"""
+===============================================
+Plot the projection matrices of an oblique tree
+===============================================
+
+This example shows how projection matrices are generated for an oblique tree,
+specifically the :class:`sktree.tree.PatchObliqueDecisionTreeClassifier`.
+
+For a tree, one can specify the structure of the data that it will be trained on
+(i.e. ``(X, y)``). This is done by specifying the ``data_dims`` parameter. For
+example, if the data is 2D, then ``data_dims`` should be set to ``(n_rows, n_cols)``,
+where now each row of ``X`` is a 1D array of length ``n_rows * n_cols``. If the data
+is 3D, then ``data_dims`` should be set to ``(n_rows, n_cols, n_depth)``, where now
+each row of ``X`` is a 1D array of length ``n_rows * n_cols * n_depth``. This allows
+the tree to be trained on data of any structured dimension, but still be compatible
+with the robust sklearn API.
+
+The projection matrices are used to generate patches of the data. These patches are
+used to calculate the feature values that are used during splitting. The patch is
+generated by sampling a hyperrectangle from the data. The hyperrectangle is defined
+by a starting point and a patch size. The starting point is sampled uniformly from
+the structure of the data. For example, if each row of ``X`` has a 2D image structure
+``(n_rows, n_cols)``, then the starting point will be sampled uniformly from the square
+grid. The patch size is sampled uniformly from the range ``min_patch_dims`` to
+``max_patch_dims``. The patch size is also constrained to be within the bounds of the
+data structure. For example, if the patch size is ``(3, 3)`` and the data structure
+is ``(5, 5)``, then the patch will only sample indices within the data.
+
+We also allow each dimension to be arbitrarily discontiguous.
+
+For details on how to use the hyperparameters related to the patches, see
+:class:`sktree.tree.PatchObliqueDecisionTreeClassifier`.
+"""
+
+import matplotlib.pyplot as plt
+
+# import modules
+# .. note:: We use a private Cython module here to demonstrate what the patches
+#           look like. This is not part of the public API. The Cython module used
+#           is just a Python wrapper for the underlying Cython code and is not the
+#           same as the Cython splitter used in the actual implementation.
+#           To use the actual splitter, one should use the public API for the
+#           relevant tree/forests class.
+import numpy as np
+
+from sktree._lib.sklearn.tree._criterion import Gini
+from sktree.tree.manifold._morf_splitter import BestPatchSplitterTester
+
+# %%
+# Initialize patch splitter
+# -------------------------
+# The patch splitter is used to generate patches for the projection matrices.
+# We will initialize the patch with some dummy values for the sake of this
+# example.
+
+# NOTE(review): presumably ``Gini(n_outputs, n_classes)`` as in scikit-learn's
+# criterion API -- confirm against the vendored module.
+criterion = Gini(1, np.array((0, 1)))
+# NOTE(review): presumably the number of rows of the sampled projection
+# matrix; the plots below index one row per panel -- confirm.
+max_features = 6
+min_samples_leaf = 1
+min_weight_leaf = 0.0
+# A single RandomState instance is shared by every splitter built below.
+random_state = np.random.RandomState(100)
+
+# No boundary or per-feature weighting is passed to the splitters in this example.
+boundary = None
+feature_weight = None
+
+# initialize some dummy data
+# X has shape (5, 25): each value of ``arange(25)`` is repeated 5 times and the
+# resulting 125 values are split into 5 rows.
+X = np.repeat(np.arange(25).astype(np.float32), 5).reshape(5, -1)
+# y has shape (5, 1) and sample_weight has shape (5,).
+y = np.array([0, 0, 0, 1, 1]).reshape(-1, 1).astype(np.float64)
+sample_weight = np.ones(5)
+
+print("The shape of our dataset is: ", X.shape, y.shape, sample_weight.shape)
+
+# %%
+# Generate 1D patches
+# -------------------
+# Now that we have the patch splitter initialized, we can generate some patches
+# and visualize how they appear on the data. We will make the patch 1D, which
+# samples multiple rows contiguously. This is a 1D patch of size up to 3: the
+# first dimension ranges from 1 to 3 while the second is fixed at 1.
+min_patch_dims = np.array((1, 1))
+max_patch_dims = np.array((3, 1))
+dim_contiguous = np.array((True, True))
+data_dims = np.array((5, 5))
+
+splitter = BestPatchSplitterTester(
+    criterion,
+    max_features,
+    min_samples_leaf,
+    min_weight_leaf,
+    random_state,
+    min_patch_dims,
+    max_patch_dims,
+    dim_contiguous,
+    data_dims,
+    boundary,
+    feature_weight,
+)
+# NOTE(review): this call passes a fourth positional ``None`` that the later
+# ``init_test`` calls in this file omit -- confirm which form is intended.
+splitter.init_test(X, y, sample_weight, None)
+
+# sample the projection matrix that consists of 1D patches
+proj_mat = splitter.sample_projection_matrix()
+print(proj_mat.shape)
+
+# Visualize 1D patches
+# One panel per row of the projection matrix, reshaped back onto the
+# ``data_dims`` grid.
+fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True)
+axs = axs.flatten()
+for idx, ax in enumerate(axs):
+    ax.imshow(proj_mat[idx, :].reshape(data_dims), cmap="viridis")
+    ax.set(
+        xlim=(-1, data_dims[1]),
+        ylim=(-1, data_dims[0]),
+        title=f"Patch {idx}",
+    )
+
+fig.suptitle("1D Patch Visualization")
+plt.show()
+
+# %%
+# Generate 2D patches
+# -------------------
+# We will make the patch 2D, which samples multiple rows and columns
+# contiguously. Each side of the patch is between 1 and 3
+# (``min_patch_dims`` to ``max_patch_dims``).
+
+min_patch_dims = np.array((1, 1))
+max_patch_dims = np.array((3, 3))
+dim_contiguous = np.array((True, True))
+data_dims = np.array((5, 5))
+
+splitter = BestPatchSplitterTester(
+    criterion,
+    max_features,
+    min_samples_leaf,
+    min_weight_leaf,
+    random_state,
+    min_patch_dims,
+    max_patch_dims,
+    dim_contiguous,
+    data_dims,
+    boundary,
+    feature_weight,
+)
+splitter.init_test(X, y, sample_weight)
+
+# sample the projection matrix that consists of 2D patches
+proj_mat = splitter.sample_projection_matrix()
+
+# Visualize 2D patches
+# One panel per row of the projection matrix, reshaped back onto the
+# ``data_dims`` grid.
+fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True)
+axs = axs.flatten()
+for idx, ax in enumerate(axs):
+    ax.imshow(proj_mat[idx, :].reshape(data_dims), cmap="viridis")
+    ax.set(
+        xlim=(-1, data_dims[1]),
+        ylim=(-1, data_dims[0]),
+        title=f"Patch {idx}",
+    )
+
+fig.suptitle("2D Patch Visualization")
+plt.show()
+
+# %%
+# Generate 3D patches
+# -------------------
+
+# initialize some dummy data
+X = np.repeat(np.arange(25 * 5).astype(np.float32), 5).reshape(5, -1)
+y = np.array([0, 0, 0, 1, 1]).reshape(-1, 1).astype(np.float64)
+sample_weight = np.ones(5)
+
+# We will make the patch 3D, which samples multiple rows contiguously. This is
+# a 3D patch of size 3 in the columns and 2 in the rows.
+min_patch_dims = np.array((1, 2, 1))
+max_patch_dims = np.array((3, 2, 4))
+dim_contiguous = np.array((True, True, True))
+data_dims = np.array((5, 5, 5))
+
+splitter = BestPatchSplitterTester(
+    criterion,
+    max_features,
+    min_samples_leaf,
+    min_weight_leaf,
+    random_state,
+    min_patch_dims,
+    max_patch_dims,
+    dim_contiguous,
+    data_dims,
+    boundary,
+    feature_weight,
+)
+splitter.init_test(X, y, sample_weight)
+
+# sample the projection matrix that consists of 1D patches
+proj_mat = splitter.sample_projection_matrix()
+print(proj_mat.shape)
+
+fig = plt.figure()
+for idx in range(3 * 2):
+    ax = fig.add_subplot(2, 3, idx + 1, projection="3d")
+
+    # Plot the surface.
+    z, x, y = proj_mat[idx, :].reshape(data_dims).nonzero()
+    ax.scatter(x, y, z, alpha=1, marker="o", color="black")
+
+    # Customize the z axis.
+    ax.set_zlim(-1.01, data_dims[2])
+    ax.set(
+        xlim=(-1, data_dims[1]),
+        ylim=(-1, data_dims[0]),
+        title=f"Patch {idx}",
+    )
+
+fig.suptitle("3D Patch Visualization")
+plt.show()
+
+
+# %%
+# Discontiguous Patches
+# ---------------------
+# We can also generate patches that are not contiguous. This is useful for
+# analyzing data that is structured, but not necessarily contiguous in certain
+# dimensions. For example, we can generate patches that sample the data in a
+# multivariate time series, where the data consists of ``(n_channels, n_times)``
+# and the patches are discontiguous in the channel dimension, but contiguous
+# in the time dimension. Here, we show an example patch.
+
+# initialize some dummy data
+# X has shape (5, 25), matching ``data_dims = (5, 5)`` below.
+X = np.repeat(np.arange(25).astype(np.float32), 5).reshape(5, -1)
+y = np.array([0, 0, 0, 1, 1]).reshape(-1, 1).astype(np.float64)
+sample_weight = np.ones(5)
+# Raised from 6 to 9 so that each panel of the 3 x 3 grid below has its own
+# row of the projection matrix.
+max_features = 9
+
+# We will make the patch 2D: its size ranges from 2 to 3 along the first
+# dimension and from 2 to 4 along the second. The first dimension is sampled
+# discontiguously, the second contiguously.
+min_patch_dims = np.array((2, 2))
+max_patch_dims = np.array((3, 4))
+dim_contiguous = np.array((False, True))
+data_dims = np.array((5, 5))
+
+splitter = BestPatchSplitterTester(
+    criterion,
+    max_features,
+    min_samples_leaf,
+    min_weight_leaf,
+    random_state,
+    min_patch_dims,
+    max_patch_dims,
+    dim_contiguous,
+    data_dims,
+    boundary,
+    feature_weight,
+)
+splitter.init_test(X, y, sample_weight)
+
+# sample the projection matrix that consists of 2D patches
+proj_mat = splitter.sample_projection_matrix()
+
+# Visualize 2D patches
+fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True)
+axs = axs.flatten()
+for idx, ax in enumerate(axs):
+    ax.imshow(proj_mat[idx, :].reshape(data_dims), cmap="viridis")
+    ax.set(
+        xlim=(-1, data_dims[1]),
+        ylim=(-1, data_dims[0]),
+        title=f"Patch {idx}",
+    )
+
+fig.suptitle("2D Discontiguous Patch Visualization")
+plt.show()
+
+# %%
+# Finally, we make the 2D patch discontiguous in both dimensions. The patch
+# size bounds, the data and ``max_features`` are reused from the previous
+# section; only ``dim_contiguous`` changes.
+dim_contiguous = np.array((False, False))
+
+splitter = BestPatchSplitterTester(
+    criterion,
+    max_features,
+    min_samples_leaf,
+    min_weight_leaf,
+    random_state,
+    min_patch_dims,
+    max_patch_dims,
+    dim_contiguous,
+    data_dims,
+    boundary,
+    feature_weight,
+)
+splitter.init_test(X, y, sample_weight)
+
+# sample the projection matrix that consists of 2D patches
+proj_mat = splitter.sample_projection_matrix()
+
+# Visualize 2D patches
+fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True)
+axs = axs.flatten()
+for idx, ax in enumerate(axs):
+    ax.imshow(proj_mat[idx, :].reshape(data_dims), cmap="viridis")
+    ax.set(
+        xlim=(-1, data_dims[1]),
+        ylim=(-1, data_dims[0]),
+        title=f"Patch {idx}",
+    )
+
+fig.suptitle("2D Discontiguous In All Dims Patch Visualization")
+plt.show()