Addressed review comments
reza-armuei committed Oct 4, 2024
1 parent 872f818 commit 5ff5076
Showing 9 changed files with 2,978 additions and 1,400 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -18,7 +18,7 @@ Below is a **curated selection** of the metrics, tools and statistical tests inc

| | **Description** | **Selection of Included Functions** |
|----------------------- |----------------- |-------------- |
| **[Continuous](https://scores.readthedocs.io/en/stable/included.html#continuous)** |Scores for evaluating single-valued continuous forecasts. |MAE, MSE, RMSE, Additive Bias, Multiplicative Bias, Pearson's Correlation Coefficient, Flip-Flop Index, Quantile Loss, Murphy Score, Quantile Interval Score, and threshold weighted scores for expectiles, quantiles and Huber Loss. |
| **[Continuous](https://scores.readthedocs.io/en/stable/included.html#continuous)** |Scores for evaluating single-valued continuous forecasts. |MAE, MSE, RMSE, Additive Bias, Multiplicative Bias, Pearson's Correlation Coefficient, Flip-Flop Index, Quantile Loss, Murphy Score, Quantile Interval Score, Interval Score, and threshold weighted scores for expectiles, quantiles and Huber Loss. |
| **[Probability](https://scores.readthedocs.io/en/stable/included.html#probability)** |Scores for evaluating forecasts that are expressed as predictive distributions, ensembles, and probabilities of binary events. |Brier Score, Continuous Ranked Probability Score (CRPS) for Cumulative Density Functions (CDF) and ensembles (including threshold weighted versions), Receiver Operating Characteristic (ROC), Isotonic Regression (reliability diagrams). |
| **[Categorical](https://scores.readthedocs.io/en/stable/included.html#categorical)** |Scores for evaluating forecasts of categories. |17 binary contingency table (confusion matrix) metrics and the FIxed Risk Multicategorical (FIRM) Score. |
| **[Spatial](https://scores.readthedocs.io/en/stable/included.html#spatial)** |Scores that take into account spatial structure. |Fractions Skill Score. |
1 change: 1 addition & 0 deletions docs/api.md
@@ -33,6 +33,7 @@
.. autofunction:: scores.continuous.tw_huber_loss
.. autofunction:: scores.continuous.tw_expectile_score
.. autofunction:: scores.continuous.quantile_interval_score
.. autofunction:: scores.continuous.interval_score
```

## scores.probability
11 changes: 6 additions & 5 deletions docs/included.md
@@ -45,6 +45,10 @@
[Tutorial](project:./tutorials/Flip_Flop_Index.md)
-
[Griffiths et al. (2019)](https://doi.org/10.1002/met.1732); [Griffiths et al. (2021)](https://doi.org/10.1071/ES21010)
* - Interval Score
- [API](api.md#scores.continuous.interval_score)
- [Tutorial](project:./tutorials/Quantile_Interval_And_Interval_Scores.md)
- [Gneiting and Raftery (2007) - Corollary 5.2](https://doi.org/10.1198/016214506000001437)
* - Isotonic Fit, *see Isotonic Regression*
- —
- —
@@ -105,13 +109,10 @@
- —
- —
- —
* - Quantile Interval Score (Interval Score)
* - Quantile Interval Score
- [API](api.md#scores.continuous.quantile_interval_score)
- [Tutorial](project:./tutorials/Quantile_Interval_Score.md)
- [Tutorial](project:./tutorials/Quantile_Interval_And_Interval_Scores.md)
- [Gneiting and Raftery (2007) - Corollary 5.2](https://doi.org/10.1198/016214506000001437)
- —
- —
- —
* - Quantile Loss (Quantile Score, Pinball Loss)
- [API](api.md#scores.continuous.quantile_score)
- [Tutorial](project:./tutorials/Quantile_Loss.md)
3 changes: 2 additions & 1 deletion src/scores/continuous/__init__.py
@@ -12,7 +12,7 @@
flip_flop_index,
flip_flop_index_proportion_exceeding,
)
from scores.continuous.interval_impl import quantile_interval_score
from scores.continuous.interval_impl import interval_score, quantile_interval_score
from scores.continuous.murphy_impl import murphy_score, murphy_thetas
from scores.continuous.quantile_loss_impl import quantile_score
from scores.continuous.standard_impl import (
@@ -57,4 +57,5 @@
"tw_quantile_score",
"tw_squared_error",
"quantile_interval_score",
"interval_score",
]
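With the `__init__.py` change above, `interval_score` becomes part of the public `scores.continuous` namespace alongside `quantile_interval_score`. A minimal sketch of the new import path (the toy values below are illustrative only and are not taken from the repository):

```python
import xarray as xr

# Both functions are importable from scores.continuous after this change.
from scores.continuous import interval_score, quantile_interval_score

# Toy interval forecast and observations (illustrative values only).
fcst_lower = xr.DataArray([10.0, 15.0, 20.0], dims="time")
fcst_upper = xr.DataArray([20.0, 25.0, 30.0], dims="time")
obs = xr.DataArray([12.0, 27.0, 21.0], dims="time")

# 80% central interval, i.e. quantile levels 0.1 and 0.9.
print(interval_score(fcst_lower, fcst_upper, obs, 0.8))
```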
138 changes: 132 additions & 6 deletions src/scores/continuous/interval_impl.py
@@ -1,5 +1,5 @@
"""
Implementation of quantile interval score
Implementation of quantile interval and interval scores
"""
from typing import Optional

@@ -25,6 +25,18 @@ def quantile_interval_score( # pylint: disable=R0914
Calculates the quantile interval score for interval forecasts. This score penalizes
the interval's width and adds a further penalty when the observation lies outside the interval.
.. math::
\\text{quantile interval score} = (q_{u} - q_{l}) +
\\frac{1}{\\alpha_l} \\cdot (q_{l} - y) \\cdot \\mathbb{1}(y < q_{l}) + \\frac{1}{1 - \\alpha_u} \\cdot (y - q_{u}) \\cdot \\mathbb{1}(y > q_{u})
where
- :math:`q_{u}` is the forecast at the upper quantile
- :math:`q_{l}` is the forecast at the lower quantile
- :math:`\\alpha_u` is the upper quantile level
- :math:`\\alpha_l` is the lower quantile level
- :math:`y` is the observation
- :math:`\\mathbb{1}(\\text{condition})` is an indicator function that is 1 when the condition is true, and 0 otherwise.
Args:
fcst_lower_qtile: array of forecast values at the lower quantile.
fcst_upper_qtile: array of forecast values at the upper quantile.
@@ -61,9 +73,21 @@ def quantile_interval_score( # pylint: disable=R0914
ValueError: If (fcst_lower_qtile > fcst_upper_qtile).any().
References:
T. Gneiting, and Raftery A.E., "Strictly proper scoring rules, prediction and
estimation", J. Amer. Stat. Assoc., Vol. 102 No. 477 (2007), pp. 359--378,
Corollary 5.2.
Gneiting, T., & Raftery, A. E. (2007). Strictly proper scoring rules, prediction,
and estimation. Journal of the American Statistical Association, 102(477), 359-378.
Corollary 5.2. https://doi.org/10.1198/016214506000001437
Examples:
Calculate the quantile interval score for forecast intervals with lower and upper
quantile levels of 0.1 and 0.6, respectively.
>>> import numpy as np
>>> import xarray as xr
>>> from scores.continuous import quantile_interval_score
>>> fcst_lower_level = xr.DataArray(np.random.uniform(10, 15, size=(30, 15)), dims=['time', 'station'])
>>> fcst_upper_level = xr.DataArray(np.random.uniform(20, 25, size=(30, 15)), dims=['time', 'station'])
>>> obs = xr.DataArray(np.random.uniform(8, 27,size=(30, 15)), dims=['time', 'station'])
>>> quantile_interval_score(fcst_lower_level, fcst_upper_level, obs, 0.1, 0.6)
"""

if not 0 < lower_qtile_level < upper_qtile_level < 1:
@@ -87,14 +111,116 @@
upper_diff = (1 / (1 - upper_qtile_level)) * (obs - fcst_upper_qtile)
upper_diff = upper_diff.clip(min=0.0)
total_score = interval_width + lower_diff + upper_diff
compnents = {
components = {
"interval_width_penalty": interval_width,
"overprediction_penalty": lower_diff,
"underprediction_penalty": upper_diff,
"total": total_score,
}
result = xr.Dataset(compnents)
result = xr.Dataset(components)
reduce_dims = gather_dimensions(fcst_lower_qtile.dims, obs.dims, reduce_dims=reduce_dims, preserve_dims=preserve_dims) # type: ignore[assignment]
results = apply_weights(result, weights=weights)
score = results.mean(dim=reduce_dims)
return score


def interval_score(
fcst_lower_qtile: xr.DataArray,
fcst_upper_qtile: xr.DataArray,
obs: xr.DataArray,
interval_range: float,
*, # Force keyword arguments to be keyword-only
reduce_dims: Optional[FlexibleDimensionTypes] = None,
preserve_dims: Optional[FlexibleDimensionTypes] = None,
weights: Optional[xr.DataArray] = None,
) -> xr.Dataset:
"""
Calculates the interval score for interval forecasts.
This function calls the `quantile_interval_score` function to calculate the interval score
for the case where the lower and upper quantile levels are symmetric about 0.5.
.. math::
\\text{interval score} = (q_{u} - q_{l}) +
\\frac{2}{\\alpha} \\cdot (q_{l} - y) \\cdot \\mathbb{1}(y < q_{l}) + \\frac{2}{\\alpha} \\cdot (y - q_{u}) \\cdot \\mathbb{1}(y > q_{u})
where
- :math:`q_{u}` is the forecast at the upper quantile
- :math:`q_{l}` is the forecast at the lower quantile
- :math:`\\alpha` is the significance level, equal to ``1 - interval_range``
- :math:`y` is the observation
- :math:`\\mathbb{1}(\\text{condition})` is an indicator function that is 1 when the condition is true, and 0 otherwise.
Args:
fcst_lower_qtile: array of forecast values at the lower quantile.
fcst_upper_qtile: array of forecast values at the upper quantile.
obs: array of observations
interval_range: Range (length) of the interval (e.g., 0.9 for a 90% interval, which
results in lower and upper quantile levels of 0.05 and 0.95). Must be strictly
between 0 and 1.
reduce_dims: Optionally specify which dimensions to reduce when
calculating the interval score. All other dimensions will be preserved.
As a special case, 'all' will allow all dimensions to be reduced. Only one
of `reduce_dims` and `preserve_dims` can be supplied. The default behaviour
if neither are supplied is to reduce all dims.
preserve_dims: Optionally specify which dimensions to preserve when calculating
interval score. All other dimensions will be reduced. As a special case,
'all' will allow all dimensions to be preserved. In this case, the result will be in
the same shape/dimensionality as the forecast, and the errors will be the interval
score at each point (i.e. single-value comparison against observed), and the
forecast and observed dimensions must match precisely. Only one of `reduce_dims`
and `preserve_dims` can be supplied. The default behaviour if neither are supplied
is to reduce all dims.
weights: Optionally provide an array for weighted averaging (e.g. by area, by latitude,
by population, custom)
Returns:
A Dataset with dimensions as determined by the `reduce_dims` and `preserve_dims` arguments.
The dataset will have the following data variables:
- interval_width_penalty: fcst_upper_qtile - fcst_lower_qtile
- overprediction_penalty: ((2/(1 - interval_range))*(fcst_lower_qtile-obs)).clip(min=0)
- underprediction_penalty: ((2/(1 - interval_range))*(obs-fcst_upper_qtile)).clip(min=0)
- total: sum of all penalties
As can be seen in the interval score equation, the lower and upper quantile levels are
derived from the interval range: `lower_qtile_level = (1 - interval_range) / 2`
and `upper_qtile_level = (1 + interval_range) / 2`.
Raises:
ValueError: If not 0 < interval_range < 1.
ValueError: If (fcst_lower_qtile > fcst_upper_qtile).any().
References:
Gneiting, T., & Raftery, A. E. (2007). Strictly proper scoring rules, prediction,
and estimation. Journal of the American Statistical Association, 102(477), 359-378.
Corollary 5.2. https://doi.org/10.1198/016214506000001437
See also:
:py:func:`scores.continuous.quantile_interval_score`
Examples:
Calculate the interval score for forecast intervals with interval range of 0.5
(i.e., lower and upper quantile levels are 0.25 and 0.75, respectively).
>>> import numpy as np
>>> import xarray as xr
>>> from scores.continuous import interval_score
>>> fcst_lower_level = xr.DataArray(np.random.uniform(10, 15, size=(30, 15)), dims=['time', 'station'])
>>> fcst_upper_level = xr.DataArray(np.random.uniform(20, 25, size=(30, 15)), dims=['time', 'station'])
>>> obs = xr.DataArray(np.random.uniform(8, 27,size=(30, 15)), dims=['time', 'station'])
>>> interval_score(fcst_lower_level, fcst_upper_level, obs, 0.5)
"""
if interval_range <= 0 or interval_range >= 1:
raise ValueError("`interval_range` must be strictly between 0 and 1")
score = quantile_interval_score(
fcst_lower_qtile=fcst_lower_qtile,
fcst_upper_qtile=fcst_upper_qtile,
obs=obs,
lower_qtile_level=(1 - interval_range) / 2,
upper_qtile_level=(1 + interval_range) / 2,
reduce_dims=reduce_dims,
preserve_dims=preserve_dims,
weights=weights,
)
return score
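Because `interval_score` is a thin wrapper that forwards to `quantile_interval_score` with symmetric quantile levels, the two functions should agree exactly when the levels are chosen as `(1 - interval_range) / 2` and `(1 + interval_range) / 2`. A minimal consistency-check sketch (the random data here is illustrative only, not taken from the test suite):

```python
import numpy as np
import xarray as xr

from scores.continuous import interval_score, quantile_interval_score

rng = np.random.default_rng(0)
fcst_lower = xr.DataArray(rng.uniform(10, 15, size=(30, 15)), dims=["time", "station"])
fcst_upper = xr.DataArray(rng.uniform(20, 25, size=(30, 15)), dims=["time", "station"])
obs = xr.DataArray(rng.uniform(8, 27, size=(30, 15)), dims=["time", "station"])

interval_range = 0.8  # 80% central interval -> quantile levels 0.1 and 0.9

via_interval = interval_score(fcst_lower, fcst_upper, obs, interval_range)
via_quantile = quantile_interval_score(
    fcst_lower,
    fcst_upper,
    obs,
    lower_qtile_level=(1 - interval_range) / 2,
    upper_qtile_level=(1 + interval_range) / 2,
)

# The two datasets should match, since interval_score delegates to quantile_interval_score.
xr.testing.assert_allclose(via_interval, via_quantile)
```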
31 changes: 30 additions & 1 deletion tests/continuous/quantile_interval_score_test_data.py
@@ -1,5 +1,6 @@
"""
Generation of the test data used to test scores.continuous.quantile_interval_score
Generation of the test data used to test scores.continuous.quantile_interval_score and
scores.continuous.interval_score
"""

import numpy as np
@@ -10,6 +11,7 @@
" {} and upper_qtile_level = {}"
)
ERROR_MESSAGE_FCST_COND = "Input does not satisfy fcst_lower_qtile < fcst_upper_qtile condition."
ERROR_MESSAGE_INTERVAL_RANGE = "`interval_range` must be strictly between 0 and 1"
FCST_LOWER = xr.DataArray([10, 15, 20], dims="time", coords={"time": np.arange(3)})
FCST_UPPER = xr.DataArray([20, 25, 30], dims="time", coords={"time": np.arange(3)})
FCST_UPPER_STATION = FCST_UPPER.rename({"time": "station"})
@@ -105,3 +107,30 @@
},
coords={"station": np.arange(3), "time": np.arange(2)},
)
EXPECTED_2D_INTERVAL = xr.Dataset(
data_vars={
"interval_width_penalty": (["time", "station"], [[10, 10, 10], [1, 2, 4]]),
"overprediction_penalty": (["time", "station"], [[0, 4, 0], [4, 0, 0]]),
"underprediction_penalty": (["time", "station"], [[0, 0, 8], [0, 0, 8]]),
"total": (["time", "station"], [[10, 14, 18], [5, 2, 12]]),
},
coords={"station": np.arange(3), "time": np.arange(2)},
)
EXPECTED_2D_WITHOUT_TIME_INTERVAL = xr.Dataset(
data_vars={
"interval_width_penalty": ("station", [5.5, 6, 7]),
"overprediction_penalty": ("station", [1.66666667, 1.66666667, 0]),
"underprediction_penalty": ("station", [0, 0, 6.66666667]),
"total": ("station", [7.16666667, 7.66666667, 13.66666667]),
},
coords={"station": np.arange(3)},
)
EXPECTED_2D_WITH_WEIGHTS_INTERVAL = xr.Dataset(
data_vars={
"interval_width_penalty": (["time", "station"], [[0, 10, 10], [1, 2, 0]]),
"overprediction_penalty": (["time", "station"], [[0, 3.33333333, 0], [3.33333333, 0, 0]]),
"underprediction_penalty": (["time", "station"], [[0, 0, 6.66666667], [0, 0, 0]]),
"total": (["time", "station"], [[0, 13.33333333, 16.66666667], [4.33333333, 2, 0]]),
},
coords={"station": np.arange(3), "time": np.arange(2)},
)
74 changes: 72 additions & 2 deletions tests/continuous/test_quantile_interval.py
@@ -1,5 +1,6 @@
"""
Contains unit tests for scores.probability.continuous.quantile_interval_score
Contains unit tests for scores.continuous.quantile_interval_score and
scores.continuous.interval_score
"""

try:
@@ -11,7 +12,7 @@
import pytest
import xarray as xr

from scores.continuous import quantile_interval_score
from scores.continuous import interval_score, quantile_interval_score
from scores.utils import DimensionError
from tests.continuous import quantile_interval_score_test_data as qistd

@@ -183,3 +184,72 @@ def test_quantile_interval_score_dask():
assert isinstance(result[var].data, dask.array.Array)
result = result.compute()
xr.testing.assert_allclose(result, qistd.EXPECTED_2D)


@pytest.mark.parametrize(
("interval_range", "lower_fcst", "upper_fcst", "error_message"),
[
(1, qistd.FCST_LOWER, qistd.FCST_UPPER, qistd.ERROR_MESSAGE_INTERVAL_RANGE),
(-0.1, qistd.FCST_LOWER, qistd.FCST_UPPER, qistd.ERROR_MESSAGE_INTERVAL_RANGE),
(0.5, qistd.FCST_UPPER, qistd.FCST_LOWER, qistd.ERROR_MESSAGE_FCST_COND),
],
)
def test_is_value_errors(interval_range, lower_fcst, upper_fcst, error_message):
"""interval_score raises ValueError."""
with pytest.raises(ValueError, match=error_message):
interval_score(
fcst_lower_qtile=lower_fcst,
fcst_upper_qtile=upper_fcst,
obs=qistd.OBS,
interval_range=interval_range,
)


@pytest.mark.parametrize(
("lower_fcst", "upper_fcst", "obs", "interval_range", "preserve_dims", "reduce_dims", "weights", "expected"),
[
(
qistd.FCST_LOWER_2D,
qistd.FCST_UPPER_2D,
qistd.OBS_2D,
0.5,
["time", "station"],
None,
None,
qistd.EXPECTED_2D_INTERVAL,
),
(
qistd.FCST_LOWER_2D,
qistd.FCST_UPPER_2D,
qistd.OBS_2D,
0.4,
None,
["time"],
None,
qistd.EXPECTED_2D_WITHOUT_TIME_INTERVAL,
),
# To test weight
(
qistd.FCST_LOWER_2D,
qistd.FCST_UPPER_2D,
qistd.OBS_2D,
0.4,
["time", "station"],
None,
qistd.WEIGHTS,
qistd.EXPECTED_2D_WITH_WEIGHTS_INTERVAL,
),
],
)
def test_is_calculations(lower_fcst, upper_fcst, obs, interval_range, preserve_dims, reduce_dims, weights, expected):
"""interval_score returns the expected object."""
result = interval_score(
obs=obs,
fcst_lower_qtile=lower_fcst,
fcst_upper_qtile=upper_fcst,
interval_range=interval_range,
reduce_dims=reduce_dims,
preserve_dims=preserve_dims,
weights=weights,
)
xr.testing.assert_allclose(result, expected)
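The new parametrised cases above reuse the fixtures defined in `quantile_interval_score_test_data.py`. One way to run just this test module from a Python session might be the following sketch (running `pytest` directly from the command line works equally well):

```python
# Invoke pytest programmatically on the updated test module only.
import pytest

pytest.main(["tests/continuous/test_quantile_interval.py", "-q"])
```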
