[DOC] Add Quantile forest examples #147

Merged 14 commits on Oct 24, 2023
Changes from 9 commits
2 changes: 1 addition & 1 deletion doc/whats_new/v0.3.rst
@@ -25,4 +25,4 @@ the project since version inception, including:

* `Adam Li`_
* `Sambit Panda`_
* `Yuxin Bai`_

6 changes: 6 additions & 0 deletions examples/quantile_predictions/README.txt
@@ -0,0 +1,6 @@
.. _quantile_examples:

Quantile Predictions with Random Forest
---------------------------------------

Examples demonstrating how to generate quantile predictions using the RandomForestRegressor class.
@@ -0,0 +1,98 @@
"""
========================================================
Predicting with different quantile interpolation methods
========================================================

An example comparison of interpolation methods that can be applied during
prediction when the desired quantile lies between two data points.

"""

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Create toy dataset.
X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]])
y = np.array([-2, -1, 0, 1, 2])

interpolations = ["linear", "lower", "higher", "midpoint", "nearest"]
colors = ["#006aff", "#ffd237", "#0d4599", "#f2a619", "#a6e5ff"]
quantiles = [0.025, 0.5, 0.975]
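# The outer quantiles (0.025 and 0.975) bound a central 95% prediction
# interval; 0.5 gives the conditional median.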

y_medians = []
y_errs = []
est = RandomForestRegressor(
    n_estimators=1,
    random_state=0,
)
# Fit the model.
est.fit(X, y)
# Get the ID of the leaf node that each training sample fell into, per tree.
leaf_ids = est.apply(X)
# Create one dictionary per tree that maps each leaf node to the indices of
# the training samples that fell into it.
node_to_indices = []
for tree in range(leaf_ids.shape[1]):
    d = defaultdict(list)
    for sample_id, leaf in enumerate(leaf_ids[:, tree]):
        d[leaf].append(sample_id)
    node_to_indices.append(d)
# Pass the prediction samples (here the training set itself) down the trained
# trees and record the leaf node each one lands in.
leaf_ids_test = est.apply(X)
# For each sample, collect the indices of the training samples that fell into
# the same leaf node, across all trees.
y_pred_quantile = []
for sample in range(leaf_ids_test.shape[0]):
    li = [
        node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])
    ]
    # Merge the per-tree lists of indices into one.
    idx = [item for sublist in li for item in sublist]
    # Gather the training targets for the corresponding indices.
    y_pred_quantile.append(y[idx])

for interpolation in interpolations:
    # Get the quantile predictions for each predicted sample.
    y_pred = [
        np.array(
            [
                np.quantile(y_pred_quantile[i], quantile, method=interpolation)
                for i in range(len(y_pred_quantile))
            ]
        )
        for quantile in quantiles
    ]
    y_medians.append(y_pred[1])
    y_errs.append(
        np.concatenate(
            (
                [y_pred[1] - y_pred[0]],
                [y_pred[2] - y_pred[1]],
            ),
            axis=0,
        )
    )

sc = plt.scatter(np.arange(len(y)) - 0.35, y, color="k", zorder=10)
ebs = []
for i, (median, y_err) in enumerate(zip(y_medians, y_errs)):
    ebs.append(
        plt.errorbar(
            np.arange(len(y)) + (0.15 * (i + 1)) - 0.35,
            median,
            yerr=y_err,
            color=colors[i],
            ecolor=colors[i],
            fmt="o",
        )
    )
plt.xlim([-0.75, len(y) - 0.25])
plt.xticks(np.arange(len(y)), X.tolist())
plt.xlabel("Samples (Feature Values)")
plt.ylabel("Actual and Predicted Values")
plt.legend([sc] + ebs, ["actual"] + interpolations, loc=2)
plt.show()
@@ -0,0 +1,164 @@
"""
==========================================================
Quantile prediction intervals with Random Forest Regressor
==========================================================

An example of how to generate quantile prediction intervals with the
RandomForestRegressor class on the California Housing dataset.

"""

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import FuncFormatter
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.utils.validation import check_random_state


# function to calculate the quantile predictions
def get_quantile_prediction(estimator, X_train, X_test, y_train, quantiles=[0.5]):
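    """Fit ``estimator`` and, for each sample in ``X_test``, return the
    requested quantiles of the training targets that share a leaf node
    with that sample across all trees."""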
    estimator.fit(X_train, y_train)
    # Get the ID of the leaf node that each training sample fell into, per tree.
    leaf_ids = estimator.apply(X_train)
    # Create one dictionary per tree that maps each leaf node to the indices of
    # the training samples that fell into it.
    node_to_indices = []
    for tree in range(leaf_ids.shape[1]):
        d = defaultdict(list)
        for sample_id, leaf in enumerate(leaf_ids[:, tree]):
            d[leaf].append(sample_id)
        node_to_indices.append(d)
    # Pass X_test down the trained trees and record the leaf node each test
    # sample lands in.
    leaf_ids_test = estimator.apply(X_test)
    # For each test sample, collect the indices of the training samples that
    # fell into the same leaf node, across all trees.
    y_pred_quantile = []
    for sample in range(leaf_ids_test.shape[0]):
        li = [
            node_to_indices[tree][leaf_ids_test[sample][tree]]
            for tree in range(leaf_ids_test.shape[1])
        ]
        # Merge the per-tree lists of indices into one.
        idx = [item for sublist in li for item in sublist]
        # Gather the training targets for the corresponding indices.
        y_pred_quantile.append(y_train[idx])
    # Get the quantile predictions for each test sample.
    y_preds = [
        [np.quantile(y_pred_quantile[i], quantile) for i in range(len(y_pred_quantile))]
        for quantile in quantiles
    ]
    return y_preds


rng = check_random_state(0)

dollar_formatter = FuncFormatter(lambda x, p: "$" + format(int(x), ","))

# Load the California Housing Prices dataset.
california = datasets.fetch_california_housing()
n_samples = min(california.target.size, 1000)
perm = rng.permutation(n_samples)
X = california.data[perm]
y = california.target[perm]

rf = RandomForestRegressor(n_estimators=100, random_state=0)

kf = KFold(n_splits=5)
kf.get_n_splits(X)

y_true = []
y_pred = []
y_pred_lower = []
y_pred_upper = []

for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = (
        X[train_index],
        X[test_index],
        y[train_index],
        y[test_index],
    )

    # Use roughly a third of the features at each split, a common heuristic
    # for regression forests.
    rf.set_params(max_features=X_train.shape[1] // 3)

    # Get predictions for the median and the bounds of a 95% prediction interval.
    y_pred_i = get_quantile_prediction(rf, X_train, X_test, y_train, quantiles=[0.025, 0.5, 0.975])

    y_true = np.concatenate((y_true, y_test))
    y_pred = np.concatenate((y_pred, y_pred_i[1]))
    y_pred_lower = np.concatenate((y_pred_lower, y_pred_i[0]))
    y_pred_upper = np.concatenate((y_pred_upper, y_pred_i[2]))

# Scale data to dollars.
y_true *= 1e5
y_pred *= 1e5
y_pred_lower *= 1e5
y_pred_upper *= 1e5
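# (fetch_california_housing targets are median house values in units of
# $100,000, hence the 1e5 factor.)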

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

sort_idx = np.argsort(y_pred)
y_true = y_true[sort_idx]
y_pred = y_pred[sort_idx]
y_pred_lower = y_pred_lower[sort_idx]
y_pred_upper = y_pred_upper[sort_idx]
y_min = min(np.minimum(y_true, y_pred))
y_max = max(np.maximum(y_true, y_pred))
y_min = float(np.round((y_min / 10000) - 1, 0) * 10000)
y_max = float(np.round((y_max / 10000) + 1, 0) * 10000)

for low, mid, upp in zip(y_pred_lower, y_pred, y_pred_upper):
    ax1.plot([mid, mid], [low, upp], lw=4, c="#e0f2ff")

ax1.plot(y_pred, y_true, c="#f2a619", lw=0, marker=".", ms=5)
ax1.plot(y_pred, y_pred_lower, alpha=0.4, c="#006AFF", lw=0, marker="_", ms=4)
ax1.plot(y_pred, y_pred_upper, alpha=0.4, c="#006AFF", lw=0, marker="_", ms=4)
ax1.plot([y_min, y_max], [y_min, y_max], ls="--", lw=1, c="grey")
ax1.grid(axis="x", color="0.95")
ax1.grid(axis="y", color="0.95")
ax1.xaxis.set_major_formatter(dollar_formatter)
ax1.yaxis.set_major_formatter(dollar_formatter)
ax1.set_xlim(y_min, y_max)
ax1.set_ylim(y_min, y_max)
ax1.set_xlabel("Fitted Values (Conditional Median)")
ax1.set_ylabel("Observed Values")

y_pred_interval = y_pred_upper - y_pred_lower
sort_idx = np.argsort(y_pred_interval)
y_true = y_true[sort_idx]
y_pred_lower = y_pred_lower[sort_idx]
y_pred_upper = y_pred_upper[sort_idx]

# Center data, with the mean of the prediction interval at 0.
mean = (y_pred_lower + y_pred_upper) / 2
y_true -= mean
y_pred_lower -= mean
y_pred_upper -= mean

ax2.plot(y_true, c="#f2a619", lw=0, marker=".", ms=5)
ax2.fill_between(
    np.arange(len(y_pred_upper)),
    y_pred_lower,
    y_pred_upper,
    alpha=0.8,
    color="#e0f2ff",
)
ax2.plot(np.arange(n_samples), y_pred_lower, alpha=0.8, c="#006aff", lw=2)
ax2.plot(np.arange(n_samples), y_pred_upper, alpha=0.8, c="#006aff", lw=2)
ax2.grid(axis="x", color="0.95")
ax2.grid(axis="y", color="0.95")
ax2.yaxis.set_major_formatter(dollar_formatter)
ax2.set_xlim([0, n_samples])
ax2.set_xlabel("Ordered Samples")
ax2.set_ylabel("Observed Values and Prediction Intervals")

plt.subplots_adjust(top=0.15)
fig.tight_layout(pad=3)

plt.show()
88 changes: 88 additions & 0 deletions examples/quantile_predictions/plot_quantile_toy_example_with_RF.py
@@ -0,0 +1,88 @@
"""
======================================================
Quantile prediction with Random Forest Regressor class
======================================================

An example that demonstrates how to use the RandomForestRegressor class to
generate quantile predictions such as the conditional median and prediction
intervals. The example compares the predictions to a ground truth function
used to generate noisy samples.

"""

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


def make_toy_dataset(n_samples, seed=0):
    rng = np.random.RandomState(seed)

    x = rng.uniform(0, 10, size=n_samples)
    f = x * np.sin(x)

    # Heteroscedastic log-normal noise, centered at zero by subtracting
    # exp(sigma^2 / 2), the mean of a log-normal with mu=0.
    sigma = 0.25 + x / 10
    noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2)
    y = f + noise

    return np.atleast_2d(x).T, y


n_samples = 1000
X, y = make_toy_dataset(n_samples)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

xx = np.atleast_2d(np.linspace(0, 10, n_samples)).T

rf = RandomForestRegressor(max_depth=3, random_state=0)
rf.fit(X_train, y_train)

y_pred = rf.predict(xx)
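
# rf.predict returns the forest's usual conditional-mean estimate; the block
# below instead derives quantile predictions from the same fitted trees.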

# Get the ID of the leaf node that each training sample fell into, per tree.
leaf_ids = rf.apply(X_train)
# Create one dictionary per tree that maps each leaf node to the indices of
# the training samples that fell into it.
node_to_indices = []
for tree in range(leaf_ids.shape[1]):
    d = defaultdict(list)
    for sample_id, leaf in enumerate(leaf_ids[:, tree]):
        d[leaf].append(sample_id)
    node_to_indices.append(d)
# Pass the evaluation grid ``xx`` down the trained trees and record the leaf
# node each point lands in.
leaf_ids_test = rf.apply(xx)
# For each point, collect the indices of the training samples that fell into
# the same leaf node, across all trees.
y_pred_quantile = []
for sample in range(leaf_ids_test.shape[0]):
    li = [
        node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])
    ]
    # Merge the per-tree lists of indices into one.
    idx = [item for sublist in li for item in sublist]
    # Gather the training targets for the corresponding indices.
    y_pred_quantile.append(y_train[idx])
# Get the quantile predictions for each point on the grid.
y_pred_low = [np.quantile(y_pred_quantile[i], 0.025) for i in range(len(y_pred_quantile))]
y_pred_med = [np.quantile(y_pred_quantile[i], 0.5) for i in range(len(y_pred_quantile))]
y_pred_upp = [np.quantile(y_pred_quantile[i], 0.975) for i in range(len(y_pred_quantile))]
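# Note: np.quantile defaults to the "linear" method; see the companion
# interpolation example for how other choices would shift these estimates.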

plt.plot(X_test, y_test, ".", c="#f2a619", label="Test Observations", ms=5)
plt.plot(xx, (xx * np.sin(xx)), c="black", label=r"$f(x) = x\,\sin(x)$", lw=2)
plt.plot(xx, y_pred_med, c="#006aff", label="Predicted Median", lw=3, ms=5)
plt.fill_between(
    xx.ravel(),
    y_pred_low,
    y_pred_upp,
    color="#e0f2ff",
    label="Predicted 95% Interval",
)
plt.xlabel("$x$")
plt.ylabel("$f(x)$")
plt.legend(loc="upper left")
plt.show()