Addressed review comments
reza-armuei committed Oct 4, 2024
1 parent 872f818 commit 5ff5076
Showing 9 changed files with 2,978 additions and 1,400 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -18,7 +18,7 @@ Below is a **curated selection** of the metrics, tools and statistical tests inc

| | **Description** | **Selection of Included Functions** |
|----------------------- |----------------- |-------------- |
| **[Continuous](https://scores.readthedocs.io/en/stable/included.html#continuous)** |Scores for evaluating single-valued continuous forecasts. |MAE, MSE, RMSE, Additive Bias, Multiplicative Bias, Pearson's Correlation Coefficient, Flip-Flop Index, Quantile Loss, Murphy Score, Quantile Interval Score, and threshold weighted scores for expectiles, quantiles and Huber Loss. |
| **[Continuous](https://scores.readthedocs.io/en/stable/included.html#continuous)** |Scores for evaluating single-valued continuous forecasts. |MAE, MSE, RMSE, Additive Bias, Multiplicative Bias, Pearson's Correlation Coefficient, Flip-Flop Index, Quantile Loss, Murphy Score, Quantile Interval Score, Interval Score, and threshold weighted scores for expectiles, quantiles and Huber Loss. |
| **[Probability](https://scores.readthedocs.io/en/stable/included.html#probability)** |Scores for evaluating forecasts that are expressed as predictive distributions, ensembles, and probabilities of binary events. |Brier Score, Continuous Ranked Probability Score (CRPS) for Cumulative Density Functions (CDF) and ensembles (including threshold weighted versions), Receiver Operating Characteristic (ROC), Isotonic Regression (reliability diagrams). |
| **[Categorical](https://scores.readthedocs.io/en/stable/included.html#categorical)** |Scores for evaluating forecasts of categories. |17 binary contingency table (confusion matrix) metrics and the FIxed Risk Multicategorical (FIRM) Score. |
| **[Spatial](https://scores.readthedocs.io/en/stable/included.html#spatial)** |Scores that take into account spatial structure. |Fractions Skill Score. |
1 change: 1 addition & 0 deletions docs/api.md
@@ -33,6 +33,7 @@
.. autofunction:: scores.continuous.tw_huber_loss
.. autofunction:: scores.continuous.tw_expectile_score
.. autofunction:: scores.continuous.quantile_interval_score
.. autofunction:: scores.continuous.interval_score
```

## scores.probability
11 changes: 6 additions & 5 deletions docs/included.md
@@ -45,6 +45,10 @@
[Tutorial](project:./tutorials/Flip_Flop_Index.md)
-
[Griffiths et al. (2019)](https://doi.org/10.1002/met.1732); [Griffiths et al. (2021)](https://doi.org/10.1071/ES21010)
* - Interval Score
- [API](api.md#scores.continuous.interval_score)
- [Tutorial](project:./tutorials/Quantile_Interval_And_Interval_Scores.md)
- [Gneiting and Raftery (2007) - Corollary 5.2](https://doi.org/10.1198/016214506000001437)
* - Isotonic Fit, *see Isotonic Regression*
- —
- —
@@ -105,13 +109,10 @@
- —
- —
- —
* - Quantile Interval Score (Interval Score)
* - Quantile Interval Score
- [API](api.md#scores.continuous.quantile_interval_score)
- [Tutorial](project:./tutorials/Quantile_Interval_Score.md)
- [Tutorial](project:./tutorials/Quantile_Interval_And_Interval_Scores.md)
- [Gneiting and Raftery (2007) - Corollary 5.2](https://doi.org/10.1198/016214506000001437)
- —
- —
- —
* - Quantile Loss (Quantile Score, Pinball Loss)
- [API](api.md#scores.continuous.quantile_score)
- [Tutorial](project:./tutorials/Quantile_Loss.md)
3 changes: 2 additions & 1 deletion src/scores/continuous/__init__.py
@@ -12,7 +12,7 @@
flip_flop_index,
flip_flop_index_proportion_exceeding,
)
from scores.continuous.interval_impl import quantile_interval_score
from scores.continuous.interval_impl import interval_score, quantile_interval_score
from scores.continuous.murphy_impl import murphy_score, murphy_thetas
from scores.continuous.quantile_loss_impl import quantile_score
from scores.continuous.standard_impl import (
@@ -57,4 +57,5 @@
"tw_quantile_score",
"tw_squared_error",
"quantile_interval_score",
"interval_score",
]
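With the `__init__.py` change above, `interval_score` becomes part of the public `scores.continuous` namespace alongside `quantile_interval_score`. A minimal sketch of the new import path (the toy values below are illustrative only and are not taken from the repository):

```python
import xarray as xr

# Both functions are importable from scores.continuous after this change.
from scores.continuous import interval_score, quantile_interval_score

# Toy interval forecast and observations (illustrative values only).
fcst_lower = xr.DataArray([10.0, 15.0, 20.0], dims="time")
fcst_upper = xr.DataArray([20.0, 25.0, 30.0], dims="time")
obs = xr.DataArray([12.0, 27.0, 21.0], dims="time")

# 80% central interval, i.e. quantile levels 0.1 and 0.9.
print(interval_score(fcst_lower, fcst_upper, obs, 0.8))
```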
138 changes: 132 additions & 6 deletions src/scores/continuous/interval_impl.py
@@ -1,5 +1,5 @@
"""
Implementation of quantile interval score
Implementation of quantile interval and interval scores
"""
from typing import Optional

@@ -25,6 +25,18 @@ def quantile_interval_score( # pylint: disable=R0914
Calculates the quantile interval score for interval forecasts. This score penalizes
the interval's width and adds a further penalty when the observation lies outside the interval.
.. math::
\\text{quantile interval score} = (q_{u} - q_{l}) +
\\frac{1}{\\alpha_l} \\cdot (q_{l} - y) \\cdot \\mathbb{1}(y < q_{l}) + \\frac{1}{1 - \\alpha_u} \\cdot (y - q_{u}) \\cdot \\mathbb{1}(y > q_{u})
where
- :math:`q_{u}` is the forecast at the upper quantile
- :math:`q_{l}` is the forecast at the lower quantile
- :math:`\\alpha_u` is the upper quantile level
- :math:`\\alpha_l` is the lower quantile level
- :math:`y` is the observation
- :math:`\\mathbb{1}(\\text{condition})` is an indicator function that is 1 when the condition is true, and 0 otherwise.
Args:
fcst_lower_qtile: array of forecast values at the lower quantile.
fcst_upper_qtile: array of forecast values at the upper quantile.
@@ -61,9 +73,21 @@ def quantile_interval_score( # pylint: disable=R0914
ValueError: If (fcst_lower_qtile > fcst_upper_qtile).any().
References:
T. Gneiting, and Raftery A.E., "Strictly proper scoring rules, prediction and
estimation", J. Amer. Stat. Assoc., Vol. 102 No. 477 (2007), pp. 359--378,
Corollary 5.2.
Gneiting, T., & Raftery, A. E. (2007). Strictly proper scoring rules, prediction,
and estimation. Journal of the American Statistical Association, 102(477), 359-378.
Corollary 5.2. https://doi.org/10.1198/016214506000001437
Examples:
Calculate the quantile interval score for forecast intervals with lower and upper
quantile levels of 0.1 and 0.6, respectively.
>>> import numpy as np
>>> import xarray as xr
>>> from scores.continuous import quantile_interval_score
>>> fcst_lower_level = xr.DataArray(np.random.uniform(10, 15, size=(30, 15)), dims=['time', 'station'])
>>> fcst_upper_level = xr.DataArray(np.random.uniform(20, 25, size=(30, 15)), dims=['time', 'station'])
>>> obs = xr.DataArray(np.random.uniform(8, 27,size=(30, 15)), dims=['time', 'station'])
>>> quantile_interval_score(fcst_lower_level, fcst_upper_level, obs, 0.1, 0.6)
"""

if not 0 < lower_qtile_level < upper_qtile_level < 1:
@@ -87,14 +111,116 @@
upper_diff = (1 / (1 - upper_qtile_level)) * (obs - fcst_upper_qtile)
upper_diff = upper_diff.clip(min=0.0)
total_score = interval_width + lower_diff + upper_diff
compnents = {
components = {
"interval_width_penalty": interval_width,
"overprediction_penalty": lower_diff,
"underprediction_penalty": upper_diff,
"total": total_score,
}
result = xr.Dataset(compnents)
result = xr.Dataset(components)
reduce_dims = gather_dimensions(fcst_lower_qtile.dims, obs.dims, reduce_dims=reduce_dims, preserve_dims=preserve_dims) # type: ignore[assignment]
results = apply_weights(result, weights=weights)
score = results.mean(dim=reduce_dims)
return score


def interval_score(
fcst_lower_qtile: xr.DataArray,
fcst_upper_qtile: xr.DataArray,
obs: xr.DataArray,
interval_range: float,
*, # Force keyword arguments to be keyword-only
reduce_dims: Optional[FlexibleDimensionTypes] = None,
preserve_dims: Optional[FlexibleDimensionTypes] = None,
weights: Optional[xr.DataArray] = None,
) -> xr.Dataset:
"""
Calculates the interval score for interval forecasts.
This function calls the `quantile_interval_score` function to calculate the interval score
for the case where the lower and upper quantile levels are symmetric about 0.5.
.. math::
\\text{interval score} = (q_{u} - q_{l}) +
\\frac{2}{\\alpha} \\cdot (q_{l} - y) \\cdot \\mathbb{1}(y < q_{l}) + \\frac{2}{\\alpha} \\cdot (y - q_{u}) \\cdot \\mathbb{1}(y > q_{u})
where
- :math:`q_{u}` is the forecast at the upper quantile
- :math:`q_{l}` is the forecast at the lower quantile
- :math:`\\alpha` is the significance level, equal to ``1 - interval_range``
- :math:`y` is the observation
- :math:`\\mathbb{1}(\\text{condition})` is an indicator function that is 1 when the condition is true, and 0 otherwise.
Args:
fcst_lower_qtile: array of forecast values at the lower quantile.
fcst_upper_qtile: array of forecast values at the upper quantile.
obs: array of observations
interval_range: Range (length) of the interval (e.g., 0.9 for a 90% interval, which
results in lower and upper quantile levels of 0.05 and 0.95). Must be strictly
between 0 and 1.
reduce_dims: Optionally specify which dimensions to reduce when
calculating the interval score. All other dimensions will be preserved.
As a special case, 'all' will allow all dimensions to be reduced. Only one
of `reduce_dims` and `preserve_dims` can be supplied. The default behaviour
if neither are supplied is to reduce all dims.
preserve_dims: Optionally specify which dimensions to preserve when calculating
interval score. All other dimensions will be reduced. As a special case,
'all' will allow all dimensions to be preserved. In this case, the result will be in
the same shape/dimensionality as the forecast, and the errors will be the interval
score at each point (i.e. single-value comparison against observed), and the
forecast and observed dimensions must match precisely. Only one of `reduce_dims`
and `preserve_dims` can be supplied. The default behaviour if neither are supplied
is to reduce all dims.
weights: Optionally provide an array for weighted averaging (e.g. by area, by latitude,
by population, custom)
Returns:
A Dataset with dimensions as determined by the `reduce_dims` and `preserve_dims` arguments.
The dataset will have the following data variables:
- interval_width_penalty: fcst_upper_qtile - fcst_lower_qtile
- overprediction_penalty: ((2/(1 - interval_range))*(fcst_lower_qtile-obs)).clip(min=0)
- underprediction_penalty: ((2/(1 - interval_range))*(obs-fcst_upper_qtile)).clip(min=0)
- total: sum of all penalties
As can be seen in the interval score equation, the lower and upper quantile levels are
derived from the interval range: `lower_qtile_level = (1 - interval_range) / 2`
and `upper_qtile_level = (1 + interval_range) / 2`.
Raises:
ValueError: If not 0 < interval_range < 1.
ValueError: If (fcst_lower_qtile > fcst_upper_qtile).any().
References:
Gneiting, T., & Raftery, A. E. (2007). Strictly proper scoring rules, prediction,
and estimation. Journal of the American Statistical Association, 102(477), 359-378.
Corollary 5.2. https://doi.org/10.1198/016214506000001437
See also:
:py:func:`scores.continuous.quantile_interval_score`
Examples:
Calculate the interval score for forecast intervals with interval range of 0.5
(i.e., lower and upper quantile levels are 0.25 and 0.75, respectively).
>>> import numpy as np
>>> import xarray as xr
>>> from scores.continuous import interval_score
>>> fcst_lower_level = xr.DataArray(np.random.uniform(10, 15, size=(30, 15)), dims=['time', 'station'])
>>> fcst_upper_level = xr.DataArray(np.random.uniform(20, 25, size=(30, 15)), dims=['time', 'station'])
>>> obs = xr.DataArray(np.random.uniform(8, 27,size=(30, 15)), dims=['time', 'station'])
>>> interval_score(fcst_lower_level, fcst_upper_level, obs, 0.5)
"""
if interval_range <= 0 or interval_range >= 1:
raise ValueError("`interval_range` must be strictly between 0 and 1")
score = quantile_interval_score(
fcst_lower_qtile=fcst_lower_qtile,
fcst_upper_qtile=fcst_upper_qtile,
obs=obs,
lower_qtile_level=(1 - interval_range) / 2,
upper_qtile_level=(1 + interval_range) / 2,
reduce_dims=reduce_dims,
preserve_dims=preserve_dims,
weights=weights,
)
return score
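Because `interval_score` is a thin wrapper that forwards to `quantile_interval_score` with symmetric quantile levels, the two functions should agree exactly when the levels are chosen as `(1 - interval_range) / 2` and `(1 + interval_range) / 2`. A minimal consistency-check sketch (the random data here is illustrative only, not taken from the test suite):

```python
import numpy as np
import xarray as xr

from scores.continuous import interval_score, quantile_interval_score

rng = np.random.default_rng(0)
fcst_lower = xr.DataArray(rng.uniform(10, 15, size=(30, 15)), dims=["time", "station"])
fcst_upper = xr.DataArray(rng.uniform(20, 25, size=(30, 15)), dims=["time", "station"])
obs = xr.DataArray(rng.uniform(8, 27, size=(30, 15)), dims=["time", "station"])

interval_range = 0.8  # 80% central interval -> quantile levels 0.1 and 0.9

via_interval = interval_score(fcst_lower, fcst_upper, obs, interval_range)
via_quantile = quantile_interval_score(
    fcst_lower,
    fcst_upper,
    obs,
    lower_qtile_level=(1 - interval_range) / 2,
    upper_qtile_level=(1 + interval_range) / 2,
)

# The two datasets should match, since interval_score delegates to quantile_interval_score.
xr.testing.assert_allclose(via_interval, via_quantile)
```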
31 changes: 30 additions & 1 deletion tests/continuous/quantile_interval_score_test_data.py
@@ -1,5 +1,6 @@
"""
Generation of the test data used to test scores.continuous.quantile_interval_score
Generation of the test data used to test scores.continuous.quantile_interval_score and
scores.continuous.interval_score
"""

import numpy as np
@@ -10,6 +11,7 @@
" {} and upper_qtile_level = {}"
)
ERROR_MESSAGE_FCST_COND = "Input does not satisfy fcst_lower_qtile < fcst_upper_qtile condition."
ERROR_MESSAGE_INTERVAL_RANGE = "`interval_range` must be strictly between 0 and 1"
FCST_LOWER = xr.DataArray([10, 15, 20], dims="time", coords={"time": np.arange(3)})
FCST_UPPER = xr.DataArray([20, 25, 30], dims="time", coords={"time": np.arange(3)})
FCST_UPPER_STATION = FCST_UPPER.rename({"time": "station"})
@@ -105,3 +107,30 @@
},
coords={"station": np.arange(3), "time": np.arange(2)},
)
EXPECTED_2D_INTERVAL = xr.Dataset(
data_vars={
"interval_width_penalty": (["time", "station"], [[10, 10, 10], [1, 2, 4]]),
"overprediction_penalty": (["time", "station"], [[0, 4, 0], [4, 0, 0]]),
"underprediction_penalty": (["time", "station"], [[0, 0, 8], [0, 0, 8]]),
"total": (["time", "station"], [[10, 14, 18], [5, 2, 12]]),
},
coords={"station": np.arange(3), "time": np.arange(2)},
)
EXPECTED_2D_WITHOUT_TIME_INTERVAL = xr.Dataset(
data_vars={
"interval_width_penalty": ("station", [5.5, 6, 7]),
"overprediction_penalty": ("station", [1.66666667, 1.66666667, 0]),
"underprediction_penalty": ("station", [0, 0, 6.66666667]),
"total": ("station", [7.16666667, 7.66666667, 13.66666667]),
},
coords={"station": np.arange(3)},
)
EXPECTED_2D_WITH_WEIGHTS_INTERVAL = xr.Dataset(
data_vars={
"interval_width_penalty": (["time", "station"], [[0, 10, 10], [1, 2, 0]]),
"overprediction_penalty": (["time", "station"], [[0, 3.33333333, 0], [3.33333333, 0, 0]]),
"underprediction_penalty": (["time", "station"], [[0, 0, 6.66666667], [0, 0, 0]]),
"total": (["time", "station"], [[0, 13.33333333, 16.66666667], [4.33333333, 2, 0]]),
},
coords={"station": np.arange(3), "time": np.arange(2)},
)
74 changes: 72 additions & 2 deletions tests/continuous/test_quantile_interval.py
@@ -1,5 +1,6 @@
"""
Contains unit tests for scores.probability.continuous.quantile_interval_score
Contains unit tests for scores.continuous.quantile_interval_score and
scores.continuous.interval_score
"""

try:
@@ -11,7 +12,7 @@
import pytest
import xarray as xr

from scores.continuous import quantile_interval_score
from scores.continuous import interval_score, quantile_interval_score
from scores.utils import DimensionError
from tests.continuous import quantile_interval_score_test_data as qistd

@@ -183,3 +184,72 @@ def test_quantile_interval_score_dask():
assert isinstance(result[var].data, dask.array.Array)
result = result.compute()
xr.testing.assert_allclose(result, qistd.EXPECTED_2D)


@pytest.mark.parametrize(
("interval_range", "lower_fcst", "upper_fcst", "error_message"),
[
(1, qistd.FCST_LOWER, qistd.FCST_UPPER, qistd.ERROR_MESSAGE_INTERVAL_RANGE),
(-0.1, qistd.FCST_LOWER, qistd.FCST_UPPER, qistd.ERROR_MESSAGE_INTERVAL_RANGE),
(0.5, qistd.FCST_UPPER, qistd.FCST_LOWER, qistd.ERROR_MESSAGE_FCST_COND),
],
)
def test_is_value_errors(interval_range, lower_fcst, upper_fcst, error_message):
"""interval_score raises ValueError."""
with pytest.raises(ValueError, match=error_message):
interval_score(
fcst_lower_qtile=lower_fcst,
fcst_upper_qtile=upper_fcst,
obs=qistd.OBS,
interval_range=interval_range,
)


@pytest.mark.parametrize(
("lower_fcst", "upper_fcst", "obs", "interval_range", "preserve_dims", "reduce_dims", "weights", "expected"),
[
(
qistd.FCST_LOWER_2D,
qistd.FCST_UPPER_2D,
qistd.OBS_2D,
0.5,
["time", "station"],
None,
None,
qistd.EXPECTED_2D_INTERVAL,
),
(
qistd.FCST_LOWER_2D,
qistd.FCST_UPPER_2D,
qistd.OBS_2D,
0.4,
None,
["time"],
None,
qistd.EXPECTED_2D_WITHOUT_TIME_INTERVAL,
),
# To test weight
(
qistd.FCST_LOWER_2D,
qistd.FCST_UPPER_2D,
qistd.OBS_2D,
0.4,
["time", "station"],
None,
qistd.WEIGHTS,
qistd.EXPECTED_2D_WITH_WEIGHTS_INTERVAL,
),
],
)
def test_is_calculations(lower_fcst, upper_fcst, obs, interval_range, preserve_dims, reduce_dims, weights, expected):
"""interval_score returns the expected object."""
result = interval_score(
obs=obs,
fcst_lower_qtile=lower_fcst,
fcst_upper_qtile=upper_fcst,
interval_range=interval_range,
reduce_dims=reduce_dims,
preserve_dims=preserve_dims,
weights=weights,
)
xr.testing.assert_allclose(result, expected)
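The new parametrised cases above reuse the fixtures defined in `quantile_interval_score_test_data.py`. One way to run just this test module from a Python session might be the following sketch (running `pytest` directly from the command line works equally well):

```python
# Invoke pytest programmatically on the updated test module only.
import pytest

pytest.main(["tests/continuous/test_quantile_interval.py", "-q"])
```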
