Skip to content

Commit

Permalink
Add a blocked K-Fold cross-validator (#254)
Browse files Browse the repository at this point in the history
The `BlockKFold` class splits the data into spatial blocks and does
K-fold cross-validation splits on the blocks. Refactor code from 
`BlockShuffleSplit` into a base class that can be shared with the
new class. Folds are balanced by making splits based on the number
of data points in each fold instead of the number of blocks.
  • Loading branch information
leouieda authored Jun 4, 2020
1 parent 8bc94e7 commit 263b29a
Show file tree
Hide file tree
Showing 10 changed files with 628 additions and 63 deletions.
2 changes: 2 additions & 0 deletions doc/api/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Model Selection
train_test_split
cross_val_score
BlockShuffleSplit
BlockKFold

Coordinate Manipulation
-----------------------
Expand Down Expand Up @@ -130,6 +131,7 @@ Base Classes and Functions
:toctree: generated/

base.BaseGridder
base.BaseBlockCrossValidator
base.n_1d_arrays
base.check_fit_input
base.least_squares
102 changes: 102 additions & 0 deletions examples/blockkfold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""
K-Fold cross-validation with blocks
===================================
Cross-validation scores for spatial data can be biased because observations are
commonly spatially autocorrelated (closer data points have similar values). One
strategy to reduce the bias is to split data along spatial blocks
[Roberts_etal2017]_. Verde offers the cross-validator
:class:`verde.BlockKFold`, which is a scikit-learn compatible version of k-fold
cross-validation using spatial blocks.
When splitting the data into training and testing sets,
:class:`~verde.BlockKFold` first splits the data into spatial blocks and then
splits the blocks into folds. During k-fold cross-validation, one fold is used
as the testing set while the rest are used for training. Since each block can
have a different number of data points, assigning the same number of blocks to
each fold can lead to folds with very different numbers of data points.
By default, :class:`~verde.BlockKFold` takes care to balance the folds to have
approximately equal number of data points.
Alternatively, you can turn off balancing to have each fold contain the same
number of blocks.
This example shows the data assigned to each of the first 3 folds of a blocked
k-fold iteration, with and without balancing. Notice that the unbalanced folds
have very different numbers of data points.
"""
import numpy as np
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import verde as vd

# Let's split the Baja California shipborne bathymetry data
data = vd.datasets.fetch_baja_bathymetry()
coordinates = (data.longitude, data.latitude)

# Create cross-validators with blocks of 30 arc-minutes with shuffling enabled.
spacing = 30 / 60
# Set the random state so that these plots don't vary when we rerun the example
random_state = 10
kfold = vd.BlockKFold(spacing=spacing, shuffle=True, random_state=random_state)
# Make another cross-validator with fold balancing turned off. Without
# balancing, the folds can have very different number of data points in them
# (which may bias the scores).
kfold_unbalanced = vd.BlockKFold(
spacing=spacing, shuffle=True, random_state=random_state, balance=False,
)

# The BlockKFold is compatible with scikit-learn, so instead of giving it a
# coordinates tuple (like we use in Verde), we have to put the coordinates in a
# feature matrix (X in scikit-learn jargon). Each column will have one of the
# coordinate values. This is usually not required if using this cross-validator
# with Verde functions and classes. You can pass it to verde.cross_val_score,
# for example.
feature_matrix = np.transpose(coordinates)

# Create the folds for the balanced and unbalanced cross-validators to show the
# difference.
balanced = kfold.split(feature_matrix)
unbalanced = kfold_unbalanced.split(feature_matrix)

# Cartopy requires setting the coordinate reference system (CRS) of the
# original data through the transform argument. Their docs say to use
# PlateCarree to represent geographic data.
crs = ccrs.PlateCarree()

# Make Mercator maps of the two cross-validator folds
fig, axes = plt.subplots(
2,
3,
figsize=(12, 10),
subplot_kw=dict(projection=ccrs.Mercator()),
sharex=True,
sharey=True,
)
for row, title, folds in zip(axes, ["Balanced", "Unbalanced"], [balanced, unbalanced]):
for i, (ax, fold) in enumerate(zip(row, folds)):
train, test = fold
ax.set_title("{} fold {} ({} testing points)".format(title, i, test.size))
# Use an utility function to setup the tick labels and the land feature
vd.datasets.setup_baja_bathymetry_map(ax)
ax.plot(
coordinates[0][train],
coordinates[1][train],
".b",
markersize=1,
transform=crs,
label="Train",
)
ax.plot(
coordinates[0][test],
coordinates[1][test],
".r",
markersize=1,
transform=crs,
label="Test",
)
# Place a legend on the first plot
axes[0, 0].legend(loc="upper right", markerscale=5)
plt.subplots_adjust(
hspace=0.1, wspace=0.05, top=0.95, bottom=0.05, left=0.05, right=0.95
)
plt.show()
7 changes: 6 additions & 1 deletion verde/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@
from .trend import Trend
from .chain import Chain
from .spline import Spline, SplineCV
from .model_selection import cross_val_score, train_test_split, BlockShuffleSplit
from .model_selection import (
cross_val_score,
train_test_split,
BlockShuffleSplit,
BlockKFold,
)
from .vector import Vector, VectorSpline2D
from .projections import project_region, project_grid

Expand Down
2 changes: 1 addition & 1 deletion verde/base/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# pylint: disable=missing-docstring
from .base_classes import BaseGridder
from .base_classes import BaseGridder, BaseBlockCrossValidator
from .least_squares import least_squares
from .utils import n_1d_arrays, check_fit_input
115 changes: 115 additions & 0 deletions verde/base/base_classes.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,131 @@
"""
Base classes for all gridders.
"""
from abc import ABCMeta, abstractmethod

import xarray as xr
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import BaseCrossValidator
from sklearn.metrics import r2_score

from ..coordinates import grid_coordinates, profile_coordinates, scatter_points
from .utils import check_data, check_fit_input


# Pylint doesn't like X, y scikit-learn argument names.
# pylint: disable=invalid-name,unused-argument


class BaseBlockCrossValidator(BaseCrossValidator, metaclass=ABCMeta):
"""
Base class for spatially blocked cross-validators.
Parameters
----------
spacing : float, tuple = (s_north, s_east), or None
The block size in the South-North and West-East directions,
respectively. A single value means that the spacing is equal in both
directions. If None, then *shape* **must be provided**.
shape : tuple = (n_north, n_east) or None
The number of blocks in the South-North and West-East directions,
respectively. If None, then *spacing* **must be provided**.
n_splits : int
Number of splitting iterations.
"""

def __init__(
self, spacing=None, shape=None, n_splits=10,
):
if spacing is None and shape is None:
raise ValueError("Either 'spacing' or 'shape' must be provided.")
self.spacing = spacing
self.shape = shape
self.n_splits = n_splits

def split(self, X, y=None, groups=None):
"""
Generate indices to split data into training and test set.
Parameters
----------
X : array-like, shape (n_samples, 2)
Columns should be the easting and northing coordinates of data
points, respectively.
y : array-like, shape (n_samples,)
The target variable for supervised learning problems. Always
ignored.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set. Always ignored.
Yields
------
train : ndarray
The training set indices for that split.
test : ndarray
The testing set indices for that split.
"""
if X.shape[1] != 2:
raise ValueError(
"X must have exactly 2 columns ({} given).".format(X.shape[1])
)
for train, test in super().split(X, y, groups):
yield train, test

def get_n_splits(self, X=None, y=None, groups=None):
"""
Returns the number of splitting iterations in the cross-validator
Parameters
----------
X : object
Always ignored, exists for compatibility.
y : object
Always ignored, exists for compatibility.
groups : object
Always ignored, exists for compatibility.
Returns
-------
n_splits : int
Returns the number of splitting iterations in the cross-validator.
"""
return self.n_splits

@abstractmethod
def _iter_test_indices(self, X=None, y=None, groups=None):
"""
Generates integer indices corresponding to test sets.
MUST BE IMPLEMENTED BY DERIVED CLASSES.
Parameters
----------
X : array-like, shape (n_samples, 2)
Columns should be the easting and northing coordinates of data
points, respectively.
y : array-like, shape (n_samples,)
The target variable for supervised learning problems. Always
ignored.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set. Always ignored.
Yields
------
test : ndarray
The testing set indices for that split.
"""


# pylint: enable=invalid-name,unused-argument


class BaseGridder(BaseEstimator):
"""
Base class for gridders.
Expand Down
Loading

0 comments on commit 263b29a

Please sign in to comment.