Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX: Standardize method applied on individual variables in OTC/dOTC #1896

Merged
merged 7 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,15 @@ Changelog

v0.53.0 (unreleased)
--------------------
Contributors to this version: Adrien Lamarche (:user:`LamAdr`), Trevor James Smith (:user:`Zeitsperre`).
Contributors to this version: Adrien Lamarche (:user:`LamAdr`), Trevor James Smith (:user:`Zeitsperre`), Éric Dupuis (:user:`coxipi`).

Bug fixes
^^^^^^^^^
* Fixed a small inefficiency in ``_otc_adjust`` (:pull:`1890`).
* Fixed a small inefficiency in ``_otc_adjust``, and the `standardize` method of `OTC/dOTC` is now applied on individual variable (:pull:`1890`, :pull:`1896`).

Breaking changes
^^^^^^^^^^^^^^^^
* `transform` argument of `OTC/dOTC` classes (and child functions) is changed to `normalization`, and `numIterMax` is changed to `num_iter_max` in `utils.optimal_transport` (:pull:`1896`).

Breaking changes
^^^^^^^^^^^^^^^^
Expand Down
4 changes: 2 additions & 2 deletions docs/notebooks/sdba.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This last plot shows the correlation between input and output per variable. Here we see a relatively strong correlation for all variables, meaning they are all taken into account when finding the optimal transport mappings. This is because we're using the (by default) `transform = 'max_distance'` argument. Were the data not transformed, the distances along the precipitation dimension would be very small relative to the temperature distances. Precipitation values would then be spread around at very low cost and have virtually no effect on the result. See this in action with `transform = None`.\n",
"This last plot shows the correlation between input and output per variable. Here we see a relatively strong correlation for all variables, meaning they are all taken into account when finding the optimal transport mappings. This is because we're using the (by default) `normalization = 'max_distance'` argument. Were the data not normalized, the distances along the precipitation dimension would be very small relative to the temperature distances. Precipitation values would then be spread around at very low cost and have virtually no effect on the result. See this in action with `normalization = None`.\n",
"\n",
"The chunks we see in the tasmax data are artefacts of the `bin_width`."
]
Expand Down Expand Up @@ -808,7 +808,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
"version": "3.12.4"
},
"toc": {
"base_numbering": 1,
Expand Down
2 changes: 2 additions & 0 deletions tests/test_sdba/test_adjustment.py
Original file line number Diff line number Diff line change
Expand Up @@ -903,9 +903,11 @@ def test_shape(self, random, series):
assert not np.isin(hist.x[hist.x.isnull()].time.values, scen.time.values).any()


# TODO: Add tests for normalization methods
class TestdOTC:
@pytest.mark.parametrize("use_dask", [True, False])
@pytest.mark.parametrize("cov_factor", ["std", "cholesky"])
# FIXME: Should this comparison not fail if `standardization` != `None`?
def test_compare_sbck(self, random, series, use_dask, cov_factor):
pytest.importorskip("ot")
pytest.importorskip("SBCK", minversion="0.4.0")
Expand Down
40 changes: 22 additions & 18 deletions xclim/sdba/_adjustment.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,7 @@ def _otc_adjust(
bin_origin: dict | float | np.ndarray | None = None,
num_iter_max: int | None = 100_000_000,
jitter_inside_bins: bool = True,
transform: str | None = "max_distance",
normalization: str | None = "max_distance",
):
"""Optimal Transport Correction of the bias of X with respect to Y.

Expand All @@ -972,8 +972,9 @@ def _otc_adjust(
jitter_inside_bins : bool
If `False`, output points are located at the center of their bin.
If `True`, a random location is picked uniformly inside their bin. Default is `True`.
transform : {None, 'standardize', 'max_distance', 'max_value'}
Per-variable transformation applied before the distances are calculated.
normalization : {None, 'standardize', 'max_distance', 'max_value'}
Per-variable transformation applied before the distances are calculated
in the optimal transport.

Returns
-------
Expand Down Expand Up @@ -1013,7 +1014,7 @@ def _otc_adjust(
gridY, muY, _ = u.histogram(Y, bin_width, bin_origin)

# Compute the optimal transportation plan
plan = u.optimal_transport(gridX, gridY, muX, muY, num_iter_max, transform)
plan = u.optimal_transport(gridX, gridY, muX, muY, num_iter_max, normalization)

gridX = np.floor((gridX - bin_origin) / bin_width)
gridY = np.floor((gridY - bin_origin) / bin_width)
Expand Down Expand Up @@ -1051,7 +1052,7 @@ def otc_adjust(
num_iter_max: int | None = 100_000_000,
jitter_inside_bins: bool = True,
adapt_freq_thresh: dict | None = None,
transform: str | None = "max_distance",
normalization: str | None = "max_distance",
):
"""Optimal Transport Correction of the bias of `hist` with respect to `ref`.

Expand All @@ -1076,8 +1077,9 @@ def otc_adjust(
If `True`, a random location is picked uniformly inside their bin. Default is `True`.
adapt_freq_thresh : dict, optional
Threshold for frequency adaptation per variable.
transform : {None, 'standardize', 'max_distance', 'max_value'}
Per-variable transformation applied before the distances are calculated.
normalization : {None, 'standardize', 'max_distance', 'max_value'}
Per-variable transformation applied before the distances are calculated
in the optimal transport.

Returns
-------
Expand Down Expand Up @@ -1121,7 +1123,7 @@ def otc_adjust(
"bin_origin": bin_origin,
"num_iter_max": num_iter_max,
"jitter_inside_bins": jitter_inside_bins,
"transform": transform,
"normalization": normalization,
},
input_core_dims=[["dim_hist", pts_dim], ["dim_ref", pts_dim]],
output_core_dims=[["dim_hist", pts_dim]],
Expand Down Expand Up @@ -1149,7 +1151,7 @@ def _dotc_adjust(
cov_factor: str | None = "std",
jitter_inside_bins: bool = True,
kind: dict | None = None,
transform: str | None = "max_distance",
normalization: str | None = "max_distance",
):
"""Dynamical Optimal Transport Correction of the bias of X with respect to Y.

Expand All @@ -1175,8 +1177,9 @@ def _dotc_adjust(
kind : dict, optional
Keys are variable names and values are adjustment kinds, either additive or multiplicative.
Unspecified dimensions are treated as "+".
transform : {None, 'standardize', 'max_distance', 'max_value'}
Per-variable transformation applied before the distances are calculated.
normalization : {None, 'standardize', 'max_distance', 'max_value'}
Per-variable transformation applied before the distances are calculated
in the optimal transport.

Returns
-------
Expand Down Expand Up @@ -1212,7 +1215,7 @@ def _dotc_adjust(
bin_origin=bin_origin,
num_iter_max=num_iter_max,
jitter_inside_bins=False,
transform=transform,
normalization=normalization,
)

# Map hist to sim
Expand All @@ -1223,7 +1226,7 @@ def _dotc_adjust(
bin_origin=bin_origin,
num_iter_max=num_iter_max,
jitter_inside_bins=False,
transform=transform,
normalization=normalization,
)

# Temporal evolution
Expand Down Expand Up @@ -1260,7 +1263,7 @@ def _dotc_adjust(
bin_origin=bin_origin,
num_iter_max=num_iter_max,
jitter_inside_bins=jitter_inside_bins,
transform=transform,
normalization=normalization,
)

return Z1
Expand All @@ -1278,7 +1281,7 @@ def dotc_adjust(
jitter_inside_bins: bool = True,
kind: dict | None = None,
adapt_freq_thresh: dict | None = None,
transform: str | None = "max_distance",
normalization: str | None = "max_distance",
):
"""Dynamical Optimal Transport Correction of the bias of X with respect to Y.

Expand Down Expand Up @@ -1309,8 +1312,9 @@ def dotc_adjust(
Unspecified dimensions are treated as "+".
adapt_freq_thresh : dict, optional
Threshold for frequency adaptation per variable.
transform : {None, 'standardize', 'max_distance', 'max_value'}
Per-variable transformation applied before the distances are calculated.
normalization : {None, 'standardize', 'max_distance', 'max_value'}
Per-variable transformation applied before the distances are calculated
in the optimal transport.

Returns
-------
Expand Down Expand Up @@ -1368,7 +1372,7 @@ def dotc_adjust(
"cov_factor": cov_factor,
"jitter_inside_bins": jitter_inside_bins,
"kind": kind,
"transform": transform,
"normalization": normalization,
},
input_core_dims=[
["dim_sim", pts_dim],
Expand Down
28 changes: 14 additions & 14 deletions xclim/sdba/adjustment.py
Original file line number Diff line number Diff line change
Expand Up @@ -1312,7 +1312,7 @@ class OTC(Adjust):
See :py:class:`xclim.sdba.processing.adapt_freq` for details.
Frequency adaptation is not applied to missing variables if is dict.
Applied to all variables if is string.
transform : {None, 'standardize', 'max_distance', 'max_value'}
normalization : {None, 'standardize', 'max_distance', 'max_value'}
Per-variable transformation applied before the distances are calculated.
Default is "max_distance".
See notes for details.
Expand Down Expand Up @@ -1343,12 +1343,12 @@ class OTC(Adjust):
with probability :math:`P_{ij}`. A transformation of bin positions can be applied before computing the distances :math:`C_{ij}`
to make variables on different scales more evenly taken into consideration by the optimization step. Available transformations are

- `transform = 'standardize'` :
- `normalization = 'standardize'` :
.. math::

i_v' = \frac{i_v - mean(i_v)}{std(i_v)} \quad\quad\quad j_v' = \frac{j_v - mean(j_v)}{std(j_v)}

- `transform = 'max_distance'` :
- `normalization = 'max_distance'` :
.. math::

i_v' = \frac{i_v}{max \{|i_v - j_v|\}} \quad\quad\quad j_v' = \frac{j_v}{max \{|i_v - j_v|\}}
Expand All @@ -1358,7 +1358,7 @@ class OTC(Adjust):

max \{|i_v' - j_v'|\} = max \{|i_w' - j_w'|\} = 1

- `transform = 'max_value'` :
- `normalization = 'max_value'` :
.. math::

i_v' = \frac{i_v}{max\{i_v\}} \quad\quad\quad j_v' = \frac{j_v}{max\{j_v\}}
Expand Down Expand Up @@ -1394,7 +1394,7 @@ def _adjust(
num_iter_max: int | None = 100_000_000,
jitter_inside_bins: bool = True,
adapt_freq_thresh: dict | str | None = None,
transform: str | None = "max_distance",
normalization: str | None = "max_distance",
group: str | Grouper = "time",
pts_dim: str = "multivar",
**kwargs,
Expand All @@ -1404,7 +1404,7 @@ def _adjust(
"POT is required for OTC and dOTC. Please install with `pip install POT`."
)

if transform not in [None, "standardize", "max_distance", "max_value"]:
if normalization not in [None, "standardize", "max_distance", "max_value"]:
raise ValueError(
"`transform` should be in [None, 'standardize', 'max_distance', 'max_value']."
)
Expand All @@ -1429,7 +1429,7 @@ def _adjust(
num_iter_max=num_iter_max,
jitter_inside_bins=jitter_inside_bins,
adapt_freq_thresh=adapt_freq_thresh,
transform=transform,
normalization=normalization,
group=group,
pts_dim=pts_dim,
).scen
Expand Down Expand Up @@ -1485,9 +1485,9 @@ class dOTC(Adjust):
See :py:class:`xclim.sdba.processing.adapt_freq` for details.
Frequency adaptation is not applied to missing variables if is dict.
Applied to all variables if is string.
transform : {None, 'standardize', 'max_distance', 'max_value'}
Per-variable transformation applied before the distances are calculated.
Default is "max_distance".
normalization : {None, 'standardize', 'max_distance', 'max_value'}
Per-variable transformation applied before the distances are calculated
in the optimal transport. Default is "max_distance".
See :py:class:`~xclim.sdba.adjustment.OTC` for details.
group : Union[str, Grouper]
The grouping information. See :py:class:`xclim.sdba.base.Grouper` for details.
Expand Down Expand Up @@ -1552,7 +1552,7 @@ def _adjust(
jitter_inside_bins: bool = True,
kind: dict | str | None = None,
adapt_freq_thresh: dict | str | None = None,
transform: str | None = "max_distance",
normalization: str | None = "max_distance",
group: str | Grouper = "time",
pts_dim: str = "multivar",
) -> xr.DataArray:
Expand All @@ -1571,9 +1571,9 @@ def _adjust(
if cov_factor not in [None, "std", "cholesky"]:
raise ValueError("`cov_factor` should be in [None, 'std', 'cholesky'].")

if transform not in [None, "standardize", "max_distance", "max_value"]:
if normalization not in [None, "standardize", "max_distance", "max_value"]:
raise ValueError(
"`transform` should be in [None, 'standardize', 'max_distance', 'max_value']."
"`normalization` should be in [None, 'standardize', 'max_distance', 'max_value']."
)

if isinstance(adapt_freq_thresh, str):
Expand All @@ -1594,7 +1594,7 @@ def _adjust(
jitter_inside_bins=jitter_inside_bins,
kind=kind,
adapt_freq_thresh=adapt_freq_thresh,
transform=transform,
normalization=normalization,
group=group,
pts_dim=pts_dim,
).scen
Expand Down
14 changes: 7 additions & 7 deletions xclim/sdba/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1019,7 +1019,7 @@ def histogram(data, bin_width, bin_origin):
return grid, mu, idx_bin


def optimal_transport(gridX, gridY, muX, muY, numItermax, transform):
def optimal_transport(gridX, gridY, muX, muY, num_iter_max, normalization):
"""Computes the optimal transportation plan on (transformations of) X and Y.

References
Expand All @@ -1034,18 +1034,18 @@ def optimal_transport(gridX, gridY, muX, muY, numItermax, transform):
"You can install it with `pip install POT`, `conda install -c conda-forge pot` or `pip install 'xclim[extras]'`."
) from e

if transform == "standardize":
gridX = (gridX - gridX.mean()) / gridX.std()
gridY = (gridY - gridY.mean()) / gridY.std()
if normalization == "standardize":
gridX = (gridX - gridX.mean(axis=0)) / gridX.std(axis=0)
gridY = (gridY - gridY.mean(axis=0)) / gridY.std(axis=0)

elif transform == "max_distance":
elif normalization == "max_distance":
max1 = np.abs(gridX.max(axis=0) - gridY.min(axis=0))
max2 = np.abs(gridY.max(axis=0) - gridX.min(axis=0))
max_dist = np.maximum(max1, max2)
gridX = gridX / max_dist
gridY = gridY / max_dist

elif transform == "max_value":
elif normalization == "max_value":
max_value = np.maximum(gridX.max(axis=0), gridY.max(axis=0))
gridX = gridX / max_value
gridY = gridY / max_value
Expand All @@ -1054,7 +1054,7 @@ def optimal_transport(gridX, gridY, muX, muY, numItermax, transform):
C = distance.cdist(gridX, gridY, "sqeuclidean")

# Compute the optimal transportation plan
gamma = emd(muX, muY, C, numItermax=numItermax)
gamma = emd(muX, muY, C, numItermax=num_iter_max)
plan = (gamma.T / gamma.sum(axis=1)).T

return plan
Expand Down
Loading