diff --git a/examples/quantile_predictions/README.txt b/examples/quantile_predictions/README.txt
new file mode 100644
index 000000000..7a691a3b3
--- /dev/null
+++ b/examples/quantile_predictions/README.txt
@@ -0,0 +1,6 @@
+.. _quantile_examples:
+
+Quantile Predictions with Random Forest
+---------------------------------------
+
+Examples demonstrating how to generate quantile predictions using Random Forest variants.
\ No newline at end of file
diff --git a/examples/quantile_predictions/plot_quantile_interpolation_with_RF.py b/examples/quantile_predictions/plot_quantile_interpolation_with_RF.py
new file mode 100644
index 000000000..3b1f10324
--- /dev/null
+++ b/examples/quantile_predictions/plot_quantile_interpolation_with_RF.py
@@ -0,0 +1,111 @@
+"""
+========================================================
+Predicting with different quantile interpolation methods
+========================================================
+
+An example comparison of interpolation methods that can be applied during
+prediction when the desired quantile lies between two data points.
+
+"""
+
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+
+# %%
+# Generate the data
+# -----------------
+# We use five simple data points to illustrate the difference between the intervals
+# generated with the different interpolation methods.
+
+X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]])
+y = np.array([-2, -1, 0, 1, 2])
+
+# %%
+# The interpolation methods
+# -------------------------
+# The interpolation methods demonstrated here are ``linear``, ``lower``, ``higher``,
+# ``midpoint``, and ``nearest``. Each one defines how a quantile is estimated when it
+# falls between two data points ``i`` and ``j`` (with ``i <= j``); they correspond to
+# the ``method`` argument of :func:`numpy.quantile`.
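+#
+# As a concrete illustration of NumPy's definitions, the 0.4 quantile of the sorted
+# values ``[1, 2, 3, 4]`` falls between ``i = 2`` and ``j = 3`` (a fraction of 0.2 of
+# the way from ``i`` to ``j``), so:
+#
+# * ``linear`` returns ``i + (j - i) * fraction`` = 2.2
+# * ``lower`` returns ``i`` = 2
+# * ``higher`` returns ``j`` = 3
+# * ``midpoint`` returns ``(i + j) / 2`` = 2.5
+# * ``nearest`` returns whichever of ``i`` or ``j`` is closest to the quantile position, here 2
+#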
+# The difference between the methods can be illustrated with the following example:
+
+interpolations = ["linear", "lower", "higher", "midpoint", "nearest"]
+colors = ["#006aff", "#ffd237", "#0d4599", "#f2a619", "#a6e5ff"]
+quantiles = [0.025, 0.5, 0.975]
+
+y_medians = []
+y_errs = []
+est = RandomForestRegressor(
+    n_estimators=1,
+    random_state=0,
+)
+# fit the model
+est.fit(X, y)
+# get the leaf node that each training sample fell into
+leaf_ids = est.apply(X)
+# for each tree, build a dictionary that maps each leaf node to the indices
+# of the samples that fell into it
+node_to_indices = []
+for tree in range(leaf_ids.shape[1]):
+    d = defaultdict(list)
+    for id, leaf in enumerate(leaf_ids[:, tree]):
+        d[leaf].append(id)
+    node_to_indices.append(d)
+# apply the prediction samples to the trained trees and
+# get the leaf node that each one lands in
+leaf_ids_test = est.apply(X)
+# for each prediction sample, collect the indices of the training samples that
+# fell into the same leaf node in each tree
+y_pred_quantile = []
+for sample in range(leaf_ids_test.shape[0]):
+    li = [
+        node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])
+    ]
+    # merge the per-tree lists of indices into one
+    idx = [item for sublist in li for item in sublist]
+    # collect the targets for the corresponding indices
+    y_pred_quantile.append(y[idx])
+
+for interpolation in interpolations:
+    # get the quantile predictions for each prediction sample
+    y_pred = [
+        np.array(
+            [
+                np.quantile(y_pred_quantile[i], quantile, method=interpolation)
+                for i in range(len(y_pred_quantile))
+            ]
+        )
+        for quantile in quantiles
+    ]
+    y_medians.append(y_pred[1])
+    y_errs.append(
+        np.concatenate(
+            (
+                [y_pred[1] - y_pred[0]],
+                [y_pred[2] - y_pred[1]],
+            ),
+            axis=0,
+        )
+    )
+
+sc = plt.scatter(np.arange(len(y)) - 0.35, y, color="k", zorder=10)
+ebs = []
+for i, (median, y_err) in enumerate(zip(y_medians, y_errs)):
+    ebs.append(
+        plt.errorbar(
+            np.arange(len(y)) + (0.15 * (i + 1)) - 0.35,
+            median,
+            yerr=y_err,
+            color=colors[i],
+            ecolor=colors[i],
+            fmt="o",
+        )
+    )
+plt.xlim([-0.75, len(y) - 0.25])
+plt.xticks(np.arange(len(y)), X.tolist())
+plt.xlabel("Samples (Feature Values)")
+plt.ylabel("Actual and Predicted Values")
+plt.legend([sc] + ebs, ["actual"] + interpolations, loc=2)
+plt.show()
diff --git a/examples/quantile_predictions/plot_quantile_regression_intervals_with_RF.py b/examples/quantile_predictions/plot_quantile_regression_intervals_with_RF.py
new file mode 100644
index 000000000..4fe6c9008
--- /dev/null
+++ b/examples/quantile_predictions/plot_quantile_regression_intervals_with_RF.py
@@ -0,0 +1,193 @@
+"""
+===========================================================
+Quantile prediction intervals with Random Forest Regressor
+===========================================================
+
+An example of how to generate quantile prediction intervals with the
+Random Forest Regressor class on the California Housing dataset.
+The plot compares the conditional median with the prediction intervals obtained
+from the 0.025, 0.5 and 0.975 quantiles, which together give a 95% prediction
+interval with lower and upper bounds.
+
+"""
+
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.ticker import FuncFormatter
+from sklearn import datasets
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import KFold
+from sklearn.utils.validation import check_random_state
+
+# %%
+# Quantile Prediction Function
+# ----------------------------
+#
+# The following function generates quantile predictions from the training samples
+# that fall into the same leaf nodes as each test sample. For every test sample we
+# collect the training targets that share its leaves and compute the requested
+# quantiles from them.
+# The function takes the following arguments:
+#
+# 1. estimator : a :class:`~sklearn.ensemble.RandomForestRegressor` estimator (or any variant).
+# 2. X_train : training data used to fit the forest.
+# 3. X_test : test data for which the quantiles are predicted.
+# 4. y_train : training targets used to fit the forest and to compute the quantile predictions.
+# 5. quantiles : list of quantiles to predict.
+
+
+# function to calculate the quantile predictions
+def get_quantile_prediction(estimator, X_train, X_test, y_train, quantiles=[0.5]):
+    estimator.fit(X_train, y_train)
+    # get the leaf node that each training sample fell into
+    leaf_ids = estimator.apply(X_train)
+    # for each tree, build a dictionary that maps each leaf node to the indices
+    # of the training samples that fell into it
+    node_to_indices = []
+    for tree in range(leaf_ids.shape[1]):
+        d = defaultdict(list)
+        for id, leaf in enumerate(leaf_ids[:, tree]):
+            d[leaf].append(id)
+        node_to_indices.append(d)
+    # apply X_test to the trained trees and
+    # get the leaf node that each test sample lands in
+    leaf_ids_test = estimator.apply(X_test)
+    # for each test sample, collect the indices of the training samples that
+    # fell into the same leaf node in each tree
+    y_pred_quantile = []
+    for sample in range(leaf_ids_test.shape[0]):
+        li = [
+            node_to_indices[tree][leaf_ids_test[sample][tree]]
+            for tree in range(leaf_ids_test.shape[1])
+        ]
+        # merge the per-tree lists of indices into one
+        idx = [item for sublist in li for item in sublist]
+        # collect the training targets for the corresponding indices
+        y_pred_quantile.append(y_train[idx])
+    # get the quantile predictions for each test sample
+    y_preds = [
+        [np.quantile(y_pred_quantile[i], quantile) for i in range(len(y_pred_quantile))]
+        for quantile in quantiles
+    ]
+    return y_preds
+
+
+rng = check_random_state(0)
+
+dollar_formatter = FuncFormatter(lambda x, p: "$" + format(int(x), ","))
+
+# %%
+# Load the California Housing Prices dataset.
+
+california = datasets.fetch_california_housing()
+n_samples = min(california.target.size, 1000)
+perm = rng.permutation(n_samples)
+X = california.data[perm]
+y = california.target[perm]
+
+rf = RandomForestRegressor(n_estimators=100, random_state=0)
+
+kf = KFold(n_splits=5)
+kf.get_n_splits(X)
+
+y_true = []
+y_pred = []
+y_pred_lower = []
+y_pred_upper = []
+
+for train_index, test_index in kf.split(X):
+    X_train, X_test, y_train, y_test = (
+        X[train_index],
+        X[test_index],
+        y[train_index],
+        y[test_index],
+    )
+
+    rf.set_params(max_features=X_train.shape[1] // 3)
+
+    # Get the lower bound, median, and upper bound of the 95% prediction interval.
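+    # ``y_pred_i`` is a list with one entry per requested quantile, here the
+    # 0.025 (lower), 0.5 (median) and 0.975 (upper) predictions for this fold.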
+    y_pred_i = get_quantile_prediction(rf, X_train, X_test, y_train, quantiles=[0.025, 0.5, 0.975])
+
+    y_true = np.concatenate((y_true, y_test))
+    y_pred = np.concatenate((y_pred, y_pred_i[1]))
+    y_pred_lower = np.concatenate((y_pred_lower, y_pred_i[0]))
+    y_pred_upper = np.concatenate((y_pred_upper, y_pred_i[2]))
+
+# Scale data to dollars.
+y_true *= 1e5
+y_pred *= 1e5
+y_pred_lower *= 1e5
+y_pred_upper *= 1e5
+
+# %%
+# Plot the results
+# ----------------
+# Plot the conditional median and the prediction intervals.
+# The left plot shows the predicted conditional median against the observed values of the
+# held-out data, with the 95% prediction intervals indicated by the blue line segments.
+# The right plot shows the same predictions sorted by the width of the prediction interval
+# and centered at the midpoint between the upper and lower bounds. This makes it easy to
+# see how the interval width varies across samples.
+
+fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
+
+y_pred_interval = y_pred_upper - y_pred_lower
+sort_idx = np.argsort(y_pred)
+y_true = y_true[sort_idx]
+y_pred = y_pred[sort_idx]
+y_pred_lower = y_pred_lower[sort_idx]
+y_pred_upper = y_pred_upper[sort_idx]
+y_min = min(np.minimum(y_true, y_pred))
+y_max = max(np.maximum(y_true, y_pred))
+y_min = float(np.round((y_min / 10000) - 1, 0) * 10000)
+y_max = float(np.round((y_max / 10000) + 1, 0) * 10000)
+
+for low, mid, upp in zip(y_pred_lower, y_pred, y_pred_upper):
+    ax1.plot([mid, mid], [low, upp], lw=4, c="#e0f2ff")
+
+ax1.plot(y_pred, y_true, c="#f2a619", lw=0, marker=".", ms=5)
+ax1.plot(y_pred, y_pred_lower, alpha=0.4, c="#006AFF", lw=0, marker="_", ms=4)
+ax1.plot(y_pred, y_pred_upper, alpha=0.4, c="#006AFF", lw=0, marker="_", ms=4)
+ax1.plot([y_min, y_max], [y_min, y_max], ls="--", lw=1, c="grey")
+ax1.grid(axis="x", color="0.95")
+ax1.grid(axis="y", color="0.95")
+ax1.xaxis.set_major_formatter(dollar_formatter)
+ax1.yaxis.set_major_formatter(dollar_formatter)
+ax1.set_xlim(y_min, y_max)
+ax1.set_ylim(y_min, y_max)
+ax1.set_xlabel("Fitted Values (Conditional Median)")
+ax1.set_ylabel("Observed Values")
+
+y_pred_interval = y_pred_upper - y_pred_lower
+sort_idx = np.argsort(y_pred_interval)
+y_true = y_true[sort_idx]
+y_pred_lower = y_pred_lower[sort_idx]
+y_pred_upper = y_pred_upper[sort_idx]
+
+# Center data, with the mean of the prediction interval at 0.
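+# Centering makes the intervals symmetric around zero, so their widths can be
+# compared directly across samples.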
+mean = (y_pred_lower + y_pred_upper) / 2
+y_true -= mean
+y_pred_lower -= mean
+y_pred_upper -= mean
+
+ax2.plot(y_true, c="#f2a619", lw=0, marker=".", ms=5)
+ax2.fill_between(
+    np.arange(len(y_pred_upper)),
+    y_pred_lower,
+    y_pred_upper,
+    alpha=0.8,
+    color="#e0f2ff",
+)
+ax2.plot(np.arange(n_samples), y_pred_lower, alpha=0.8, c="#006aff", lw=2)
+ax2.plot(np.arange(n_samples), y_pred_upper, alpha=0.8, c="#006aff", lw=2)
+ax2.grid(axis="x", color="0.95")
+ax2.grid(axis="y", color="0.95")
+ax2.yaxis.set_major_formatter(dollar_formatter)
+ax2.set_xlim([0, n_samples])
+ax2.set_xlabel("Ordered Samples")
+ax2.set_ylabel("Observed Values and Prediction Intervals")
+
+plt.subplots_adjust(top=0.15)
+fig.tight_layout(pad=3)
+
+plt.show()
diff --git a/examples/quantile_predictions/plot_quantile_toy_example_with_RF.py b/examples/quantile_predictions/plot_quantile_toy_example_with_RF.py
new file mode 100644
index 000000000..fb98d64d7
--- /dev/null
+++ b/examples/quantile_predictions/plot_quantile_toy_example_with_RF.py
@@ -0,0 +1,104 @@
+"""
+======================================================
+Quantile prediction with Random Forest Regressor class
+======================================================
+
+An example that demonstrates how to use a Random Forest to generate
+quantile predictions such as the conditional median and prediction intervals.
+The example compares the predictions to the ground-truth function used
+to generate the noisy samples.
+
+"""
+
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+
+# %%
+# Generate the data
+# -----------------
+
+
+def make_toy_dataset(n_samples, seed=0):
+    rng = np.random.RandomState(seed)
+
+    x = rng.uniform(0, 10, size=n_samples)
+    f = x * np.sin(x)
+
+    sigma = 0.25 + x / 10
+    noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2)
+    y = f + noise
+
+    return np.atleast_2d(x).T, y
+
+
+n_samples = 1000
+X, y = make_toy_dataset(n_samples)
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+
+xx = np.atleast_2d(np.linspace(0, 10, n_samples)).T
+
+
+# %%
+# Fit the model to the training samples
+# -------------------------------------
+
+rf = RandomForestRegressor(max_depth=3, random_state=0)
+rf.fit(X_train, y_train)
+
+y_pred = rf.predict(xx)
+
+# get the leaf node that each training sample fell into
+leaf_ids = rf.apply(X_train)
+# for each tree, build a dictionary that maps each leaf node to the indices
+# of the training samples that fell into it
+node_to_indices = []
+for tree in range(leaf_ids.shape[1]):
+    d = defaultdict(list)
+    for id, leaf in enumerate(leaf_ids[:, tree]):
+        d[leaf].append(id)
+    node_to_indices.append(d)
+# apply the grid of evaluation points to the trained trees and
+# get the leaf node that each one lands in
+leaf_ids_test = rf.apply(xx)
+# for each evaluation point, collect the indices of the training samples that
+# fell into the same leaf node in each tree
+y_pred_quantile = []
+for sample in range(leaf_ids_test.shape[0]):
+    li = [
+        node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])
+    ]
+    # merge the per-tree lists of indices into one
+    idx = [item for sublist in li for item in sublist]
+    # collect the training targets for the corresponding indices
+    y_pred_quantile.append(y_train[idx])
+# get the quantile predictions for each evaluation point
+y_pred_low = [np.quantile(y_pred_quantile[i], 0.025) for i in range(len(y_pred_quantile))]
+y_pred_med = [np.quantile(y_pred_quantile[i], 0.5) for i in range(len(y_pred_quantile))]
+y_pred_upp = [np.quantile(y_pred_quantile[i], 0.975) for i in range(len(y_pred_quantile))]
+
+# %%
+# Plot the results
+# ----------------
+# Plot the conditional median and the prediction intervals.
+# The blue line is the predicted median and the shaded area indicates the 95% prediction
+# interval. The dots are the held-out test observations and the black line is the function
+# used to generate the noisy samples.
+
+plt.plot(X_test, y_test, ".", c="#f2a619", label="Test Observations", ms=5)
+plt.plot(xx, (xx * np.sin(xx)), c="black", label=r"$f(x) = x\,\sin(x)$", lw=2)
+plt.plot(xx, y_pred_med, c="#006aff", label="Predicted Median", lw=3, ms=5)
+plt.fill_between(
+    xx.ravel(),
+    y_pred_low,
+    y_pred_upp,
+    color="#e0f2ff",
+    label="Predicted 95% Interval",
+)
+plt.xlabel("$x$")
+plt.ylabel("$f(x)$")
+plt.legend(loc="upper left")
+plt.show()
diff --git a/examples/quantile_predictions/plot_quantile_vs_standard_oblique_forest.py b/examples/quantile_predictions/plot_quantile_vs_standard_oblique_forest.py
new file mode 100644
index 000000000..7d6231694
--- /dev/null
+++ b/examples/quantile_predictions/plot_quantile_vs_standard_oblique_forest.py
@@ -0,0 +1,91 @@
+"""
+==============================================================
+Quantile regression vs. standard and oblique regression forest
+==============================================================
+
+An example that generates quantile predictions with an oblique random forest
+on a synthetic, right-skewed dataset and compares the predicted median with
+the forest's usual mean prediction.
+
+"""
+
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy import stats
+from sklearn.model_selection import train_test_split
+from sklearn.utils.validation import check_random_state
+
+from sktree.ensemble import ObliqueRandomForestRegressor
+
+rng = check_random_state(0)
+
+# %%
+# Generate the data
+# -----------------
+# We use a synthetic dataset with 2 features and 5000 samples. The target is
+# drawn from a skew-normal distribution, so its mean lies to the right of its
+# median.
+
+n_samples = 5000
+a, loc, scale = 5, -1, 1
+skewnorm_rv = stats.skewnorm(a, loc, scale)
+skewnorm_rv.random_state = rng
+y = skewnorm_rv.rvs(n_samples)
+X = rng.randn(n_samples, 2) * y.reshape(-1, 1)
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
+
+regr_orf = ObliqueRandomForestRegressor(n_estimators=10, random_state=0)
+
+regr_orf.fit(X_train, y_train)
+
+y_pred_orf = regr_orf.predict(X_test)
+
+# %%
+# Generate Quantile Predictions
+# -----------------------------
+# The idea is that, for each test sample, we collect the training samples that fell
+# into the same leaf nodes across the trees and use their targets to compute the
+# desired quantile statistics for the prediction.
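+#
+# Concretely, we (1) record the leaf that every training sample reaches in each tree,
+# (2) for each test sample gather the training targets that share its leaves across
+# the trees, and (3) take the median (0.5 quantile) of that pooled set, as shown below.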
+# Get the leaf node that each training sample falls into
+leaf_ids = regr_orf.apply(X_train)
+# for each tree, build a dictionary that maps each leaf node to the indices
+# of the training samples that fell into it
+node_to_indices = []
+for tree in range(leaf_ids.shape[1]):
+    d = defaultdict(list)
+    for id, leaf in enumerate(leaf_ids[:, tree]):
+        d[leaf].append(id)
+    node_to_indices.append(d)
+# apply X_test to the trained trees and
+# get the leaf node that each test sample lands in
+leaf_ids_test = regr_orf.apply(X_test)
+# for each test sample, collect the indices of the training samples that
+# fell into the same leaf node in each tree
+y_pred_quantile = []
+for sample in range(leaf_ids_test.shape[0]):
+    li = [
+        node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])
+    ]
+    # merge the per-tree lists of indices into one
+    idx = [item for sublist in li for item in sublist]
+    # collect the training targets for the corresponding indices
+    y_pred_quantile.append(y_train[idx])
+# get the median (0.5 quantile) prediction for each test sample
+y_pred_median = [np.quantile(y_pred_quantile[i], 0.5) for i in range(len(y_pred_quantile))]
+
+# %%
+# Plot the results
+# ----------------
+# The plot shows the distribution of the actual target values, the predicted medians
+# (i.e. the 0.5 quantile), and the mean predictions from the same oblique random forest
+# regressor. On this skewed dataset, the median obtained with the quantile method tracks
+# the off-center target distribution better than the usual mean prediction.
+
+colors = ["#c0c0c0", "#a6e5ff", "#e7a4f5"]
+names = ["Actual", "QRF (Median)", "ORF (Mean)"]
+plt.hist([y_test, y_pred_median, y_pred_orf], bins=50, color=colors, label=names)
+plt.xlabel("Actual and Predicted Target Values")
+plt.ylabel("Counts")
+plt.legend()
+plt.show()