Merge pull request #504 from scikit-learn-contrib/493-add-mondrian-cp

493 add mondrian cp
scikit-learn-contrib · Sep 3, 2024 · dd39a25 · dd39a25
2 parents 603b5da + 12e71f3
commit dd39a25
Show file tree

Hide file tree

Showing 17 changed files with 1,123 additions and 2 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -5,6 +5,7 @@ History
 0.8.x (2024-xx-xx)
 ------------------
 
+* Add Mondrian Conformal Prediction for regression and classification
 * Add `** predict_params` in fit and predict method for Mapie Regression
 * Update the ts-changepoint notebook with the tutorial
 * Change import related to conformity scores into ts-changepoint notebook

diff --git a/doc/Makefile b/doc/Makefile
@@ -52,6 +52,7 @@ clean:
 	-rm -rf examples_classification/
 	-rm -rf examples_multilabel_classification/
 	-rm -rf examples_calibration/
+	-rm -rf examples_mondrian/
 	-rm -rf generated/*
 	-rm -rf modules/generated/*
 

diff --git a/doc/api.rst b/doc/api.rst
@@ -108,3 +108,13 @@ Resampling
 
    subsample.BlockBootstrap
    subsample.Subsample
+
+
+Mondrian
+==========
+
+.. autosummary::
+   :toctree: generated/
+   :template: class.rst
+
+   mondrian.MondrianCP
diff --git a/doc/conf.py b/doc/conf.py
@@ -316,13 +316,15 @@
         "../examples/regression",
         "../examples/classification",
         "../examples/multilabel_classification",
-        "../examples/calibration"
+        "../examples/calibration",
+        "../examples/mondrian",
     ],
     "gallery_dirs": [
         "examples_regression",
         "examples_classification",
         "examples_multilabel_classification",
-        "examples_calibration"
+        "examples_calibration",
+        "examples_mondrian",
     ],
     "doc_module": "mapie",
     "backreferences_dir": os.path.join("generated"),

diff --git a/doc/images/mondrian.png b/doc/images/mondrian.png
diff --git a/doc/index.rst b/doc/index.rst
@@ -49,6 +49,14 @@
    examples_multilabel_classification/1-quickstart/plot_tutorial_multilabel_classification
    notebooks_multilabel_classification
 
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+   :caption: MONDRIAN
+
+   theoretical_description_mondrian
+   examples_mondrian/1-quickstart/plot_main-tutorial-mondrian-regression
+
 .. toctree::
    :maxdepth: 2
    :hidden:

diff --git a/doc/theoretical_description_mondrian.rst b/doc/theoretical_description_mondrian.rst
@@ -0,0 +1,46 @@
+.. title:: Theoretical Description Mondrian : contents
+
+.. _theoretical_description_mondrian:
+
+#######################
+Theoretical Description
+#######################
+
+Mondrian conformal prediction (MCP) [1] is a method for building prediction sets with a group-conditional
+coverage guarantee. The coverage guarantee is given by:
+
+.. math::
+    P \{Y_{n+1} \in \hat{C}_{n, \alpha}(X_{n+1}) | G_{n+1} = g\} \geq 1 - \alpha
+
+where :math:`G_{n+1}` is the group of the new test point :math:`X_{n+1}` and :math:`g`
+is a group in the set of groups :math:`\mathcal{G}`.
+
+MCP can be used with any split conformal predictor and can be particularly useful when one has prior
+knowledge about existing groups, whether the information is directly included in the features
+of the data or not.
+In a classification setting, the groups can be defined as the predicted classes of the data. Doing so,
+one can ensure that, for each predicted class, the coverage guarantee is satisfied.
+
+In order to achieve the group-conditional coverage guarantee, MCP simply classifies the data
+according to the groups and then applies the split conformal predictor to each group separately.
+
+The quantile of each group is defined as:
+
+.. math::
+   \widehat{q}^{(g)} = \text{Quantile}\left(s_1, ..., s_{n^{(g)}}, \frac{\lceil (n^{(g)} + 1)(1-\alpha)\rceil}{n^{(g)}} \right)
+
+where :math:`s_1, ..., s_{n^{(g)}}` are the conformity scores of the training points in group :math:`g` and :math:`n^{(g)}`
+is the number of training points in group :math:`g`.
+
+The following figure (from [1]) explains the process of Mondrian conformal prediction:
+
+.. image:: images/mondrian.png
+   :width: 600
+   :align: center
+
+References
+----------
+
+[1] Vladimir Vovk, David Lindsay, Ilia Nouretdinov, and Alex Gammerman.
+Mondrian confidence machine.
+Technical report, Royal Holloway University of London, 2003
diff --git a/doc/tutorial_mondrian_regression_files/tutorial_mondrian_regression_13_0.png b/doc/tutorial_mondrian_regression_files/tutorial_mondrian_regression_13_0.png
diff --git a/doc/tutorial_mondrian_regression_files/tutorial_mondrian_regression_15_2.png b/doc/tutorial_mondrian_regression_files/tutorial_mondrian_regression_15_2.png
diff --git a/doc/tutorial_mondrian_regression_files/tutorial_mondrian_regression_2_0.png b/doc/tutorial_mondrian_regression_files/tutorial_mondrian_regression_2_0.png
diff --git a/doc/tutorial_mondrian_regression_files/tutorial_mondrian_regression_5_0.png b/doc/tutorial_mondrian_regression_files/tutorial_mondrian_regression_5_0.png
diff --git a/doc/tutorial_mondrian_regression_files/tutorial_mondrian_regression_8_1.png b/doc/tutorial_mondrian_regression_files/tutorial_mondrian_regression_8_1.png
diff --git a/examples/mondrian/1-quickstart/README.rst b/examples/mondrian/1-quickstart/README.rst
@@ -0,0 +1,6 @@
+.. _mondrian_examples_1:
+
+1. Quickstart examples
+----------------------
+
+The following examples present Mondrian conformal prediction with MAPIE through basic quickstart regression problems.
diff --git a/examples/mondrian/1-quickstart/plot_main-tutorial-mondrian-regression.py b/examples/mondrian/1-quickstart/plot_main-tutorial-mondrian-regression.py
@@ -0,0 +1,185 @@
+r"""
+=============================================
+Tutorial for tabular regression with Mondrian
+=============================================
+
+In this tutorial, we compare the prediction intervals estimated by MAPIE on a
+simple, one-dimensional, ground truth function with classical conformal
+prediction intervals versus Mondrian conformal prediction intervals.
+The function is a sinusoidal function with added noise, and the data is
+grouped in 10 groups. The goal is to estimate the prediction intervals
+for new data points, and to compare the coverage of the prediction intervals
+by groups.
+Throughout this tutorial, we will answer the following questions:
+
+
+- How to use MAPIE to estimate prediction intervals for a regression problem?
+- How to use Mondrian conformal prediction intervals for regression?
+- How to compare the coverage of the prediction intervals by groups?
+"""
+
+import os
+import warnings
+
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestRegressor
+
+from mapie.metrics import regression_coverage_score_v2
+from mapie.mondrian import MondrianCP
+from mapie.regression import MapieRegressor
+
+# NOTE(review): TF_CPP_MIN_LOG_LEVEL looks like a TensorFlow logging switch,
+# yet nothing in this script imports TensorFlow - presumably inherited from
+# another tutorial; confirm before relying on it.
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+# Ignore every warning raised while the example runs.
+warnings.filterwarnings("ignore")
+
+
+##############################################################################
+# 1. Create the noisy dataset
+# -----------------------------
+# We create a dataset with 10 groups, each of those groups having a different
+# level of noise.
+
+
+n_points = 100000
+np.random.seed(0)
+X = np.linspace(0, 10, n_points).reshape(-1, 1)
+
+# Standard deviation of the Gaussian noise added in each group, in group
+# order; the number of groups follows from the length of this list.
+noise_scales = [0.1, 0.5, 1, .4, .2, .3, .6, .7, .8, .9]
+n_groups = len(noise_scales)
+group_size = n_points // n_groups
+
+# Consecutive blocks of ``group_size`` points share the same group label.
+partition = np.repeat(np.arange(n_groups), group_size)
+
+# The noise is drawn group by group, in order, so the random stream (and
+# therefore the dataset) matches one explicit draw per group.
+noise = np.concatenate(
+    [np.random.normal(0, scale, group_size) for scale in noise_scales]
+)
+
+# The groups are contiguous along X, so adding the concatenated noise to the
+# sinusoid of the whole feature column applies each group's noise level to
+# that group's points.
+y = np.sin(X[:, 0] * 2) + noise
+
+
+##############################################################################
+# We plot the dataset with the partition as colors.
+
+
+plt.scatter(X, y, c=partition)
+plt.show()
+
+
+##############################################################################
+# 2. Split the dataset into a training set, a calibration set, and a test set.
+# ----------------------------------------------------------------------------
+
+# Features, targets and group labels are split in a single call so that the
+# three arrays stay aligned by construction instead of relying on two
+# separate calls sharing the same ``random_state``.
+(
+    X_train_temp, X_test,
+    y_train_temp, y_test,
+    partition_train_temp, partition_test,
+) = train_test_split(X, y, partition, test_size=0.2, random_state=0)
+
+# The remaining points are split in half: one half calibrates the conformal
+# predictors, the other half trains the underlying model.
+(
+    X_cal, X_train,
+    y_cal, y_train,
+    partition_cal, partition_train,
+) = train_test_split(
+    X_train_temp, y_train_temp, partition_train_temp,
+    test_size=0.5, random_state=0
+)
+
+
+##############################################################################
+# We plot the training set, the calibration set, and the test set.
+
+
+f, ax = plt.subplots(1, 3, figsize=(15, 5))
+ax[0].scatter(X_train, y_train, c=partition_train)
+ax[0].set_title("Train set")
+ax[1].scatter(X_cal, y_cal, c=partition_cal)
+ax[1].set_title("Calibration set")
+ax[2].scatter(X_test, y_test, c=partition_test)
+ax[2].set_title("Test set")
+plt.show()
+
+
+##############################################################################
+# 3. Fit a random forest regressor on the training set.
+# ----------------------------------------------------------------------------
+
+rf = RandomForestRegressor(n_estimators=100)
+rf.fit(X_train, y_train)
+
+
+##############################################################################
+# 4. Fit a MapieRegressor and a MondrianCP on the calibration set.
+# ----------------------------------------------------------------------------
+
+# Both wrappers receive the random forest fitted above with ``cv="prefit"``.
+# The Mondrian wrapper is also given the group of each calibration point.
+mapie_regressor = MapieRegressor(rf, cv="prefit")
+mondrian_regressor = MondrianCP(MapieRegressor(rf, cv="prefit"))
+mapie_regressor.fit(X_cal, y_cal)
+mondrian_regressor.fit(X_cal, y_cal, partition=partition_cal)
+
+
+##############################################################################
+# 5. Predict the prediction intervals on the test set with both methods.
+# ----------------------------------------------------------------------------
+
+# Only the second returned value (the prediction intervals) is kept; the
+# Mondrian predictor also needs the group of each test point.
+_, y_pss_split = mapie_regressor.predict(X_test, alpha=.1)
+_, y_pss_mondrian = mondrian_regressor.predict(
+    X_test, partition=partition_test, alpha=.1
+)
+
+
+##############################################################################
+# 6. Compare the coverage by partition, plot both methods side by side.
+# ----------------------------------------------------------------------------
+
+# For every group present in the test set, compute the coverage of both
+# methods restricted to the test points of that group.
+coverages = {}
+for group in np.unique(partition_test):
+    in_group = partition_test == group
+    coverages[group] = {
+        "split": regression_coverage_score_v2(
+            y_test[in_group], y_pss_split[in_group]
+        ),
+        "mondrian": regression_coverage_score_v2(
+            y_test[in_group], y_pss_mondrian[in_group]
+        ),
+    }
+
+
+# Plot the coverage by groups, plot both methods side by side
+# Each group owns two adjacent slots on the x-axis: Split, then Mondrian.
+n_groups_plotted = len(coverages)
+split_positions = np.arange(n_groups_plotted) * 2
+plt.figure(figsize=(10, 5))
+plt.bar(
+    split_positions,
+    [float(scores["split"]) for scores in coverages.values()],
+    label="Split"
+)
+plt.bar(
+    split_positions + 1,
+    [float(scores["mondrian"]) for scores in coverages.values()],
+    label="Mondrian"
+)
+plt.xticks(
+    split_positions + .5,
+    [f"Group {group}" for group in coverages],
+    rotation=45
+)
+# The reference line spans all bars whatever the number of groups.
+plt.hlines(
+    0.9, -1, 2 * n_groups_plotted + 1,
+    label="90% coverage", color="black", linestyle="--"
+)
+plt.ylabel("Coverage")
+plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
+plt.tight_layout()
+plt.show()
diff --git a/examples/mondrian/README.rst b/examples/mondrian/README.rst
@@ -0,0 +1,4 @@
+.. _mondrian_examples:
+
+Mondrian examples
+=======================