From 6e540dabc957f4eb56fdd00b94a25a9eac5258ad Mon Sep 17 00:00:00 2001
From: Alexander Beedie <alexander-beedie@users.noreply.github.com>
Date: Thu, 18 Jan 2024 13:02:56 +0000
Subject: [PATCH] optimise metrics creation, support additional temporal
 metrics, expose percentile interpolation

---
 py-polars/polars/dataframe/frame.py           | 219 ++++++++++--------
 py-polars/polars/series/series.py             |   6 +-
 .../tests/unit/dataframe/test_describe.py     |  38 +--
 py-polars/tests/unit/series/test_describe.py  |   4 +
 4 files changed, 160 insertions(+), 107 deletions(-)

diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index 78077071565a4..af9655d5e7bc2 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -7,6 +7,7 @@
 import warnings
 from collections import OrderedDict, defaultdict
 from collections.abc import Sized
+from functools import lru_cache
 from io import BytesIO, StringIO, TextIOWrapper
 from operator import itemgetter
 from pathlib import Path
@@ -4352,7 +4353,10 @@ def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]:
         return None
 
     def describe(
-        self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
+        self,
+        percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75),
+        *,
+        interpolation: RollingInterpolationMethod = "nearest",
     ) -> Self:
         """
         Summary statistics for a DataFrame.
@@ -4363,15 +4367,17 @@ def describe(
             One or more percentiles to include in the summary statistics.
             All values must be in the range `[0, 1]`.
 
+        interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
+            Interpolation method used when calculating percentiles.
+
         Notes
         -----
         The median is included by default as the 50% percentile.
 
         Warnings
         --------
-        We will never guarantee the output of describe to be stable.
-        It will show statistics that we deem informative and may
-        be updated in the future.
+        We do not guarantee the output of `describe` to be stable. It will show
+        statistics that we deem informative, and may be updated in the future.
 
         See Also
         --------
@@ -4379,114 +4385,141 @@ def describe(
 
         Examples
         --------
-        >>> from datetime import date
+        >>> from datetime import date, time
         >>> df = pl.DataFrame(
         ...     {
         ...         "float": [1.0, 2.8, 3.0],
-        ...         "int": [4, 5, None],
+        ...         "int": [40, 50, None],
         ...         "bool": [True, False, True],
-        ...         "str": [None, "b", "c"],
-        ...         "str2": ["usd", "eur", None],
+        ...         "str": ["zz", "xx", "yy"],
         ...         "date": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)],
+        ...         "time": [time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)],
         ...     }
         ... )
+
+        Show default frame statistics:
+
         >>> df.describe()
         shape: (9, 7)
-        ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐
-        │ describe   ┆ float    ┆ int      ┆ bool     ┆ str  ┆ str2 ┆ date       │
-        │ ---        ┆ ---      ┆ ---      ┆ ---      ┆ ---  ┆ ---  ┆ ---        │
-        │ str        ┆ f64      ┆ f64      ┆ f64      ┆ str  ┆ str  ┆ str        │
-        ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
-        │ count      ┆ 3.0      ┆ 2.0      ┆ 3.0      ┆ 2    ┆ 2    ┆ 3          │
-        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0.0      ┆ 1    ┆ 1    ┆ 0          │
-        │ mean       ┆ 2.266667 ┆ 4.5      ┆ 0.666667 ┆ null ┆ null ┆ null       │
-        │ std        ┆ 1.101514 ┆ 0.707107 ┆ null     ┆ null ┆ null ┆ null       │
-        │ min        ┆ 1.0      ┆ 4.0      ┆ null     ┆ b    ┆ eur  ┆ 2020-01-01 │
-        │ 25%        ┆ 2.8      ┆ 4.0      ┆ null     ┆ null ┆ null ┆ null       │
-        │ 50%        ┆ 2.8      ┆ 5.0      ┆ null     ┆ null ┆ null ┆ 2021-07-05 │
-        │ 75%        ┆ 3.0      ┆ 5.0      ┆ null     ┆ null ┆ null ┆ null       │
-        │ max        ┆ 3.0      ┆ 5.0      ┆ null     ┆ c    ┆ usd  ┆ 2022-12-31 │
-        └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
+        ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐
+        │ statistic  ┆ float    ┆ int      ┆ bool     ┆ str  ┆ date       ┆ time     │
+        │ ---        ┆ ---      ┆ ---      ┆ ---      ┆ ---  ┆ ---        ┆ ---      │
+        │ str        ┆ f64      ┆ f64      ┆ f64      ┆ str  ┆ str        ┆ str      │
+        ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡
+        │ count      ┆ 3.0      ┆ 2.0      ┆ 3.0      ┆ 3    ┆ 3          ┆ 3        │
+        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0.0      ┆ 0    ┆ 0          ┆ 0        │
+        │ mean       ┆ 2.266667 ┆ 45.0     ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │
+        │ std        ┆ 1.101514 ┆ 7.071068 ┆ null     ┆ null ┆ null       ┆ null     │
+        │ min        ┆ 1.0      ┆ 40.0     ┆ 0.0      ┆ xx   ┆ 2020-01-01 ┆ 10:20:30 │
+        │ 25%        ┆ 2.8      ┆ 40.0     ┆ null     ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
+        │ 50%        ┆ 2.8      ┆ 50.0     ┆ null     ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
+        │ 75%        ┆ 3.0      ┆ 50.0     ┆ null     ┆ null ┆ 2022-12-31 ┆ 23:15:10 │
+        │ max        ┆ 3.0      ┆ 50.0     ┆ 1.0      ┆ zz   ┆ 2022-12-31 ┆ 23:15:10 │
+        └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘
+
+        Customize which percentiles are displayed, applying linear interpolation:
+
+        >>> df.describe(
+        ...     percentiles=[0.1, 0.3, 0.5, 0.7, 0.9],
+        ...     interpolation="linear",
+        ... )
+        shape: (11, 7)
+        ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐
+        │ statistic  ┆ float    ┆ int      ┆ bool     ┆ str  ┆ date       ┆ time     │
+        │ ---        ┆ ---      ┆ ---      ┆ ---      ┆ ---  ┆ ---        ┆ ---      │
+        │ str        ┆ f64      ┆ f64      ┆ f64      ┆ str  ┆ str        ┆ str      │
+        ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡
+        │ count      ┆ 3.0      ┆ 2.0      ┆ 3.0      ┆ 3    ┆ 3          ┆ 3        │
+        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0.0      ┆ 0    ┆ 0          ┆ 0        │
+        │ mean       ┆ 2.266667 ┆ 45.0     ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │
+        │ std        ┆ 1.101514 ┆ 7.071068 ┆ null     ┆ null ┆ null       ┆ null     │
+        │ min        ┆ 1.0      ┆ 40.0     ┆ 0.0      ┆ xx   ┆ 2020-01-01 ┆ 10:20:30 │
+        │ 10%        ┆ 1.36     ┆ 41.0     ┆ null     ┆ null ┆ 2020-04-20 ┆ 11:13:34 │
+        │ 30%        ┆ 2.08     ┆ 43.0     ┆ null     ┆ null ┆ 2020-11-26 ┆ 12:59:42 │
+        │ 50%        ┆ 2.8      ┆ 45.0     ┆ null     ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
+        │ 70%        ┆ 2.88     ┆ 47.0     ┆ null     ┆ null ┆ 2022-02-07 ┆ 18:09:34 │
+        │ 90%        ┆ 2.96     ┆ 49.0     ┆ null     ┆ null ┆ 2022-09-13 ┆ 21:33:18 │
+        │ max        ┆ 3.0      ┆ 50.0     ┆ 1.0      ┆ zz   ┆ 2022-12-31 ┆ 23:15:10 │
+        └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘
         """
         if not self.columns:
             msg = "cannot describe a DataFrame without any columns"
             raise TypeError(msg)
 
-        # Determine which columns should get std/mean/percentile statistics
-        stat_cols, temporal_cols, bool_cols, numeric_result = set(), set(), set(), set()
-        schema = self.schema
-        for c, dt in schema.items():
-            if dt.is_numeric():
-                stat_cols.add(c)
-                numeric_result.add(c)
-            elif dt.is_temporal():
-                temporal_cols.add(c)
-            elif dt == Boolean:
-                bool_cols.add(c)
-                numeric_result.add(c)
-            elif dt == Null or dt.is_nested():
-                numeric_result.add(c)
-
-        # Determine metrics and optional/additional percentiles
+        # create list of metrics
         metrics = ["count", "null_count", "mean", "std", "min"]
-        percentile_exprs = []
-        for p in parse_percentiles(percentiles):
-            for c in self.columns:
-                if c in stat_cols:
-                    expr = F.col(c).quantile(p)
-                elif p == 0.5 and c in temporal_cols:
-                    expr = F.col(c).to_physical().median().cast(schema[c])
-                else:
-                    expr = F.lit(None)
-                expr = expr.alias(f"{p}:{c}")
-                percentile_exprs.append(expr)
-            metrics.append(f"{p*100:g}%")
+        if quantiles := parse_percentiles(percentiles):
+            metrics.extend(f"{q * 100:g}%" for q in quantiles)
         metrics.append("max")
 
-        mean_exprs = [
-            (
-                F.col(c).mean() if (c in stat_cols or c in bool_cols) else F.lit(None)
-            ).alias(f"mean:{c}")
-            for c in self.columns
-        ]
-        std_exprs = [
-            (F.col(c).std() if c in stat_cols else F.lit(None)).alias(f"std:{c}")
-            for c in self.columns
-        ]
-        minmax_cols = {
-            c
-            for c, dt in self.schema.items()
-            if not dt.is_nested()
-            and dt not in (Object, Null, Unknown, Categorical, Enum, Boolean)
-        }
-        min_exprs = [
-            (F.col(c).min() if c in minmax_cols else F.lit(None)).alias(f"min:{c}")
-            for c in self.columns
-        ]
-        max_exprs = [
-            (F.col(c).max() if c in minmax_cols else F.lit(None)).alias(f"max:{c}")
-            for c in self.columns
-        ]
+        @lru_cache
+        def skip_minmax(dt: PolarsDataType) -> bool:
+            return dt.is_nested() or dt in (Object, Null, Unknown, Categorical, Enum)
 
-        # Calculate metrics in parallel
-        df_metrics = self.select(
-            F.all().count().name.prefix("count:"),
-            F.all().null_count().name.prefix("null_count:"),
-            *mean_exprs,
-            *std_exprs,
-            *min_exprs,
-            *percentile_exprs,
-            *max_exprs,
-        )
+        # build the metric expressions for each column (in `metrics` order)
+        numeric_result = set()
+        metric_exprs = []
+        null = F.lit(None)
+
+        for c, dt in self.schema.items():
+            is_numeric = dt.is_numeric()
+            is_temporal = not is_numeric and dt.is_temporal()
+
+            # counts
+            count_exprs = [
+                F.col(c).count().name.prefix("count:"),
+                F.col(c).null_count().name.prefix("null_count:"),
+            ]
+            metric_exprs.extend(count_exprs)
+
+            # mean
+            if is_temporal:
+                mean_expr = F.col(c).to_physical().mean().cast(dt)
+            else:
+                mean_expr = F.col(c).mean() if is_numeric or dt == Boolean else null
+            metric_exprs.append(mean_expr.alias(f"mean:{c}"))
+
+            # standard deviation
+            expr_std = F.col(c).std() if is_numeric else null
+            metric_exprs.append(expr_std.alias(f"std:{c}"))
+
+            # min
+            min_expr = F.col(c).min() if not skip_minmax(dt) else null
+            metric_exprs.append(min_expr.alias(f"min:{c}"))
+
+            # percentiles
+            for p in quantiles:
+                pct_expr = (
+                    (
+                        F.col(c).to_physical().quantile(p, interpolation).cast(dt)
+                        if is_temporal
+                        else F.col(c).quantile(p, interpolation)
+                    )
+                    if (is_numeric or is_temporal)
+                    else null
+                )
+                metric_exprs.append(pct_expr.alias(f"{p}:{c}"))
+
+            # max
+            metric_exprs.append(
+                (F.col(c).max() if not skip_minmax(dt) else null).alias(f"max:{c}")
+            )
+
+            if is_numeric or dt.is_nested() or dt in (Null, Boolean):
+                numeric_result.add(c)
+
+        # calculate metrics in parallel
+        df_metrics = self.select(*metric_exprs)
 
-        # Reshape wide result
-        described = [
-            df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
-            for n in range(len(metrics))
+        # reshape wide result (one row, `n_metrics` consecutive values per column)
+        n_metrics = len(metrics)
+        column_metrics = [
+            df_metrics.row(0)[(n * n_metrics) : (n + 1) * n_metrics]
+            for n in range(self.width)
         ]
+        summary = dict(zip(self.columns, column_metrics))
 
-        # Cast by column type (numeric/bool -> float), (other -> string)
-        summary = dict(zip(self.columns, list(zip(*described))))
+        # cast by column type (numeric/bool -> float), (other -> string)
         for c in self.columns:
             summary[c] = [  # type: ignore[assignment]
                 None
@@ -4495,9 +4528,9 @@ def describe(
                 for v in summary[c]
             ]
 
-        # Return results as a DataFrame
+        # return results as a DataFrame
         df_summary = self._from_dict(summary)
-        df_summary.insert_column(0, pl.Series("describe", metrics))
+        df_summary.insert_column(0, pl.Series("statistic", metrics))
         return df_summary
 
     def get_column_index(self, name: str) -> int:
diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py
index a8199a48ea69d..c27bbaab1a400 100644
--- a/py-polars/polars/series/series.py
+++ b/py-polars/polars/series/series.py
@@ -1818,7 +1818,9 @@ def to_frame(self, name: str | None = None) -> DataFrame:
         return wrap_df(PyDataFrame([self._s]))
 
     def describe(
-        self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
+        self,
+        percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75),
+        interpolation: RollingInterpolationMethod = "nearest",
     ) -> DataFrame:
         """
         Quick summary statistics of a Series.
@@ -1831,6 +1833,8 @@ def describe(
         percentiles
             One or more percentiles to include in the summary statistics (if the
             Series has a numeric dtype). All values must be in the range `[0, 1]`.
+        interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
+            Interpolation method used when calculating percentiles.
 
         Notes
         -----
diff --git a/py-polars/tests/unit/dataframe/test_describe.py b/py-polars/tests/unit/dataframe/test_describe.py
index 8254e8df18ebd..bf1cf1d2bf3a2 100644
--- a/py-polars/tests/unit/dataframe/test_describe.py
+++ b/py-polars/tests/unit/dataframe/test_describe.py
@@ -29,7 +29,7 @@ def test_df_describe() -> None:
     result = df.describe()
     expected = pl.DataFrame(
         {
-            "describe": [
+            "statistic": [
                 "count",
                 "null_count",
                 "mean",
@@ -52,7 +52,7 @@ def test_df_describe() -> None:
                 3.0,
             ],
             "b": [2.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 5.0, 5.0, 5.0],
-            "c": [3.0, 0.0, 2 / 3, None, None, None, None, None, None],
+            "c": [3.0, 0.0, 2 / 3, None, False, None, None, None, True],
             "d": ["2", "1", None, None, "b", None, None, None, "c"],
             "e": ["2", "1", None, None, None, None, None, None, None],
             "f": [
@@ -61,9 +61,9 @@ def test_df_describe() -> None:
                 None,
                 None,
                 "2020-01-01 10:30:00",
-                None,
                 "2021-07-05 15:00:00",
-                None,
+                "2021-07-05 15:00:00",
+                "2022-12-31 20:30:00",
                 "2022-12-31 20:30:00",
             ],
             "g": [
@@ -72,12 +72,22 @@ def test_df_describe() -> None:
                 None,
                 None,
                 "2020-01-01",
-                None,
                 "2021-07-05",
-                None,
+                "2021-07-05",
+                "2022-12-31",
                 "2022-12-31",
             ],
-            "h": ["3", "0", None, None, "10:30:00", None, "15:00:00", None, "20:30:00"],
+            "h": [
+                "3",
+                "0",
+                None,
+                None,
+                "10:30:00",
+                "15:00:00",
+                "15:00:00",
+                "20:30:00",
+                "20:30:00",
+            ],
         }
     )
     assert_frame_equal(result, expected)
@@ -103,7 +113,7 @@ def test_df_describe_nested() -> None:
             ("75%", None, None),
             ("max", None, None),
         ],
-        schema=["describe"] + df.columns,
+        schema=["statistic"] + df.columns,
         schema_overrides={"struct": pl.Float64, "list": pl.Float64},
     )
     assert_frame_equal(result, expected)
@@ -126,7 +136,7 @@ def test_df_describe_custom_percentiles() -> None:
             ("80%", 2.0),
             ("max", 2.0),
         ],
-        schema=["describe"] + df.columns,
+        schema=["statistic"] + df.columns,
     )
     assert_frame_equal(result, expected)
 
@@ -144,7 +154,7 @@ def test_df_describe_no_percentiles(pcts: list[float] | None) -> None:
             ("min", 1.0),
             ("max", 2.0),
         ],
-        schema=["describe"] + df.columns,
+        schema=["statistic"] + df.columns,
     )
     assert_frame_equal(result, expected)
 
@@ -164,7 +174,7 @@ def test_df_describe_empty_column() -> None:
             ("75%", None),
             ("max", None),
         ],
-        schema=["describe"] + df.columns,
+        schema=["statistic"] + df.columns,
     )
     assert_frame_equal(result, expected)
 
@@ -180,7 +190,7 @@ def test_df_describe_empty() -> None:
 def test_df_describe_quantile_precision() -> None:
     df = pl.DataFrame({"a": range(10)})
     result = df.describe(percentiles=[0.99, 0.999, 0.9999])
-    result_metrics = result.get_column("describe").to_list()
+    result_metrics = result.get_column("statistic").to_list()
     expected_metrics = ["99%", "99.9%", "99.99%"]
     for m in expected_metrics:
         assert m in result_metrics
@@ -196,5 +206,7 @@ def test_df_describe_object() -> None:
 
     result = df.describe(percentiles=(0.05, 0.25, 0.5, 0.75, 0.95))
 
-    expected = pl.DataFrame({"describe": ["count", "null_count"], "object": ["3", "0"]})
+    expected = pl.DataFrame(
+        {"statistic": ["count", "null_count"], "object": ["3", "0"]}
+    )
     assert_frame_equal(result.head(2), expected)
diff --git a/py-polars/tests/unit/series/test_describe.py b/py-polars/tests/unit/series/test_describe.py
index 00b0c376ee771..06730549089e0 100644
--- a/py-polars/tests/unit/series/test_describe.py
+++ b/py-polars/tests/unit/series/test_describe.py
@@ -64,6 +64,8 @@ def test_series_describe_boolean() -> None:
         "count": 4,
         "null_count": 1,
         "mean": 0.75,
+        "min": False,
+        "max": True,
     }
     expected = pl.DataFrame(
         data={"statistic": stats.keys(), "value": stats.values()},
@@ -80,7 +82,9 @@ def test_series_describe_date() -> None:
         "count": "3",
         "null_count": "0",
         "min": "2021-01-01",
+        "25%": "2021-01-02",
         "50%": "2021-01-02",
+        "75%": "2021-01-03",
         "max": "2021-01-03",
     }
     expected = pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})