Add a blocked K-Fold cross-validator (#254)

The `BlockKFold` class splits the data into spatial blocks and does K-fold cross-validation splits on the blocks. Refactor code from `BlockShuffleSplit` into a base class that can be shared with the new class. Folds are balanced by making splits based on the number of data points in each fold instead of the number of blocks.
fatiando · Jun 4, 2020 · 263b29a · 263b29a
1 parent 8bc94e7
commit 263b29a
Show file tree

Hide file tree

Showing 10 changed files with 628 additions and 63 deletions.
diff --git a/doc/api/index.rst b/doc/api/index.rst
@@ -46,6 +46,7 @@ Model Selection
     train_test_split
     cross_val_score
     BlockShuffleSplit
+    BlockKFold
 
 Coordinate Manipulation
 -----------------------
@@ -130,6 +131,7 @@ Base Classes and Functions
    :toctree: generated/
 
     base.BaseGridder
+    base.BaseBlockCrossValidator
     base.n_1d_arrays
     base.check_fit_input
     base.least_squares
diff --git a/examples/blockkfold.py b/examples/blockkfold.py
@@ -0,0 +1,102 @@
+"""
+K-Fold cross-validation with blocks
+===================================
+
+Cross-validation scores for spatial data can be biased because observations are
+commonly spatially autocorrelated (closer data points have similar values). One
+strategy to reduce the bias is to split data along spatial blocks
+[Roberts_etal2017]_. Verde offers the cross-validator
+:class:`verde.BlockKFold`, which is a scikit-learn compatible version of k-fold
+cross-validation using spatial blocks.
+
+When splitting the data into training and testing sets,
+:class:`~verde.BlockKFold` first splits the data into spatial blocks and then
+splits the blocks into folds. During k-fold cross-validation, one fold is used
+as the testing set while the rest are used for training. Since each block can
+have a different number of data points, assigning the same number of blocks to
+each fold can lead to folds with very different numbers of data points.
+By default, :class:`~verde.BlockKFold` takes care to balance the folds to have
+approximately equal number of data points.
+Alternatively, you can turn off balancing to have each fold contain the same
+number of blocks.
+
+This example shows the data assigned to each of the first 3 folds of a blocked
+k-fold iteration, with and without balancing. Notice that the unbalanced folds
+have very different numbers of data points.
+"""
+import numpy as np
+import matplotlib.pyplot as plt
+import cartopy.crs as ccrs
+import verde as vd
+
+# Let's split the Baja California shipborne bathymetry data
+data = vd.datasets.fetch_baja_bathymetry()
+coordinates = (data.longitude, data.latitude)
+
+# Create cross-validators with blocks of 30 arc-minutes with shuffling enabled.
+spacing = 30 / 60
+# Set the random state so that these plots don't vary when we rerun the example
+random_state = 10
+kfold = vd.BlockKFold(spacing=spacing, shuffle=True, random_state=random_state)
+# Make another cross-validator with fold balancing turned off. Without
+# balancing, the folds can have very different number of data points in them
+# (which may bias the scores).
+kfold_unbalanced = vd.BlockKFold(
+    spacing=spacing, shuffle=True, random_state=random_state, balance=False,
+)
+
+# The BlockKFold is compatible with scikit-learn, so instead of giving it a
+# coordinates tuple (like we use in Verde), we have to put the coordinates in a
+# feature matrix (X in scikit-learn jargon). Each column will have one of the
+# coordinate values. This is usually not required if using this cross-validator
+# with Verde functions and classes. You can pass it to verde.cross_val_score,
+# for example.
+feature_matrix = np.transpose(coordinates)
+
+# Create the folds for the balanced and unbalanced cross-validators to show the
+# difference.
+balanced = kfold.split(feature_matrix)
+unbalanced = kfold_unbalanced.split(feature_matrix)
+
+# Cartopy requires setting the coordinate reference system (CRS) of the
+# original data through the transform argument. Their docs say to use
+# PlateCarree to represent geographic data.
+crs = ccrs.PlateCarree()
+
+# Make Mercator maps of the two cross-validator folds
+fig, axes = plt.subplots(
+    2,
+    3,
+    figsize=(12, 10),
+    subplot_kw=dict(projection=ccrs.Mercator()),
+    sharex=True,
+    sharey=True,
+)
+for row, title, folds in zip(axes, ["Balanced", "Unbalanced"], [balanced, unbalanced]):
+    for i, (ax, fold) in enumerate(zip(row, folds)):
+        train, test = fold
+        ax.set_title("{} fold {} ({} testing points)".format(title, i, test.size))
+        # Use an utility function to setup the tick labels and the land feature
+        vd.datasets.setup_baja_bathymetry_map(ax)
+        ax.plot(
+            coordinates[0][train],
+            coordinates[1][train],
+            ".b",
+            markersize=1,
+            transform=crs,
+            label="Train",
+        )
+        ax.plot(
+            coordinates[0][test],
+            coordinates[1][test],
+            ".r",
+            markersize=1,
+            transform=crs,
+            label="Test",
+        )
+# Place a legend on the first plot
+axes[0, 0].legend(loc="upper right", markerscale=5)
+plt.subplots_adjust(
+    hspace=0.1, wspace=0.05, top=0.95, bottom=0.05, left=0.05, right=0.95
+)
+plt.show()
diff --git a/verde/__init__.py b/verde/__init__.py
@@ -23,7 +23,12 @@
 from .trend import Trend
 from .chain import Chain
 from .spline import Spline, SplineCV
-from .model_selection import cross_val_score, train_test_split, BlockShuffleSplit
+from .model_selection import (
+    cross_val_score,
+    train_test_split,
+    BlockShuffleSplit,
+    BlockKFold,
+)
 from .vector import Vector, VectorSpline2D
 from .projections import project_region, project_grid
 

diff --git a/verde/base/__init__.py b/verde/base/__init__.py
@@ -1,4 +1,4 @@
 # pylint: disable=missing-docstring
-from .base_classes import BaseGridder
+from .base_classes import BaseGridder, BaseBlockCrossValidator
 from .least_squares import least_squares
 from .utils import n_1d_arrays, check_fit_input
diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py
@@ -1,16 +1,131 @@
 """
 Base classes for all gridders.
 """
+from abc import ABCMeta, abstractmethod
+
 import xarray as xr
 import pandas as pd
 import numpy as np
 from sklearn.base import BaseEstimator
+from sklearn.model_selection import BaseCrossValidator
 from sklearn.metrics import r2_score
 
 from ..coordinates import grid_coordinates, profile_coordinates, scatter_points
 from .utils import check_data, check_fit_input
 
 
+# Pylint doesn't like X, y scikit-learn argument names.
+# pylint: disable=invalid-name,unused-argument
+
+
+class BaseBlockCrossValidator(BaseCrossValidator, metaclass=ABCMeta):
+    """
+    Base class for spatially blocked cross-validators.
+
+    Parameters
+    ----------
+    spacing : float, tuple = (s_north, s_east), or None
+        The block size in the South-North and West-East directions,
+        respectively. A single value means that the spacing is equal in both
+        directions. If None, then *shape* **must be provided**.
+    shape : tuple = (n_north, n_east) or None
+        The number of blocks in the South-North and West-East directions,
+        respectively. If None, then *spacing* **must be provided**.
+    n_splits : int
+        Number of splitting iterations.
+
+    """
+
+    def __init__(
+        self, spacing=None, shape=None, n_splits=10,
+    ):
+        if spacing is None and shape is None:
+            raise ValueError("Either 'spacing' or 'shape' must be provided.")
+        self.spacing = spacing
+        self.shape = shape
+        self.n_splits = n_splits
+
+    def split(self, X, y=None, groups=None):
+        """
+        Generate indices to split data into training and test set.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, 2)
+            Columns should be the easting and northing coordinates of data
+            points, respectively.
+        y : array-like, shape (n_samples,)
+            The target variable for supervised learning problems. Always
+            ignored.
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset into
+            train/test set. Always ignored.
+
+        Yields
+        ------
+        train : ndarray
+            The training set indices for that split.
+        test : ndarray
+            The testing set indices for that split.
+
+        """
+        if X.shape[1] != 2:
+            raise ValueError(
+                "X must have exactly 2 columns ({} given).".format(X.shape[1])
+            )
+        for train, test in super().split(X, y, groups):
+            yield train, test
+
+    def get_n_splits(self, X=None, y=None, groups=None):
+        """
+        Returns the number of splitting iterations in the cross-validator
+
+        Parameters
+        ----------
+        X : object
+            Always ignored, exists for compatibility.
+        y : object
+            Always ignored, exists for compatibility.
+        groups : object
+            Always ignored, exists for compatibility.
+
+        Returns
+        -------
+        n_splits : int
+            Returns the number of splitting iterations in the cross-validator.
+        """
+        return self.n_splits
+
+    @abstractmethod
+    def _iter_test_indices(self, X=None, y=None, groups=None):
+        """
+        Generates integer indices corresponding to test sets.
+
+        MUST BE IMPLEMENTED BY DERIVED CLASSES.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, 2)
+            Columns should be the easting and northing coordinates of data
+            points, respectively.
+        y : array-like, shape (n_samples,)
+            The target variable for supervised learning problems. Always
+            ignored.
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset into
+            train/test set. Always ignored.
+
+        Yields
+        ------
+        test : ndarray
+            The testing set indices for that split.
+
+        """
+
+
+# pylint: enable=invalid-name,unused-argument
+
+
 class BaseGridder(BaseEstimator):
     """
     Base class for gridders.