diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index 8ccd0d99c44d..b682ff006019 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -6,6 +6,7 @@
 import random
 from collections import OrderedDict, defaultdict
 from collections.abc import Sized
+from functools import lru_cache
 from io import BytesIO, StringIO, TextIOWrapper
 from operator import itemgetter
 from pathlib import Path
@@ -4357,7 +4358,10 @@ def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]:
         return None
 
     def describe(
-        self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
+        self,
+        percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75),
+        *,
+        interpolation: RollingInterpolationMethod = "nearest",
     ) -> Self:
         """
         Summary statistics for a DataFrame.
@@ -4368,15 +4372,17 @@
             One or more percentiles to include in the summary statistics.
             All values must be in the range `[0, 1]`.
 
+        interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
+            Interpolation method used when calculating percentiles.
+
         Notes
         -----
         The median is included by default as the 50% percentile.
 
         Warnings
         --------
-        We will never guarantee the output of describe to be stable.
-        It will show statistics that we deem informative and may
-        be updated in the future.
+        We do not guarantee the output of `describe` to be stable. It will show
+        statistics that we deem informative, and may be updated in the future.
 
         See Also
         --------
@@ -4384,117 +4390,160 @@
         Examples
         --------
-        >>> from datetime import date
+        >>> from datetime import date, time
         >>> df = pl.DataFrame(
         ...     {
         ...         "float": [1.0, 2.8, 3.0],
-        ...         "int": [4, 5, None],
+        ...         "int": [40, 50, None],
         ...         "bool": [True, False, True],
-        ...         "str": [None, "b", "c"],
-        ...         "str2": ["usd", "eur", None],
-        ...         "date": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)],
+        ...         "str": ["zz", "xx", "yy"],
+        ...         "date": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)],
+        ...         "time": [time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)],
         ...     }
         ... )
+
+        Show default frame statistics:
+
         >>> df.describe()
         shape: (9, 7)
-        ┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐
-        │ describe   ┆ float    ┆ int      ┆ bool  ┆ str  ┆ str2 ┆ date       │
-        │ ---        ┆ ---      ┆ ---      ┆ ---   ┆ ---  ┆ ---  ┆ ---        │
-        │ str        ┆ f64      ┆ f64      ┆ str   ┆ str  ┆ str  ┆ str        │
-        ╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡
-        │ count      ┆ 3.0      ┆ 2.0      ┆ 3     ┆ 2    ┆ 2    ┆ 3          │
-        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0     ┆ 1    ┆ 1    ┆ 0          │
-        │ mean       ┆ 2.266667 ┆ 4.5      ┆ null  ┆ null ┆ null ┆ null       │
-        │ std        ┆ 1.101514 ┆ 0.707107 ┆ null  ┆ null ┆ null ┆ null       │
-        │ min        ┆ 1.0      ┆ 4.0      ┆ False ┆ b    ┆ eur  ┆ 2020-01-01 │
-        │ 25%        ┆ 2.8      ┆ 4.0      ┆ null  ┆ null ┆ null ┆ null       │
-        │ 50%        ┆ 2.8      ┆ 5.0      ┆ null  ┆ null ┆ null ┆ null       │
-        │ 75%        ┆ 3.0      ┆ 5.0      ┆ null  ┆ null ┆ null ┆ null       │
-        │ max        ┆ 3.0      ┆ 5.0      ┆ True  ┆ c    ┆ usd  ┆ 2022-01-01 │
-        └────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘
+        ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐
+        │ statistic  ┆ float    ┆ int      ┆ bool     ┆ str  ┆ date       ┆ time     │
+        │ ---        ┆ ---      ┆ ---      ┆ ---      ┆ ---  ┆ ---        ┆ ---      │
+        │ str        ┆ f64      ┆ f64      ┆ f64      ┆ str  ┆ str        ┆ str      │
+        ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡
+        │ count      ┆ 3.0      ┆ 2.0      ┆ 3.0      ┆ 3    ┆ 3          ┆ 3        │
+        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0.0      ┆ 0    ┆ 0          ┆ 0        │
+        │ mean       ┆ 2.266667 ┆ 45.0     ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │
+        │ std        ┆ 1.101514 ┆ 7.071068 ┆ null     ┆ null ┆ null       ┆ null     │
+        │ min        ┆ 1.0      ┆ 40.0     ┆ 0.0      ┆ xx   ┆ 2020-01-01 ┆ 10:20:30 │
+        │ 25%        ┆ 2.8      ┆ 40.0     ┆ null     ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
+        │ 50%        ┆ 2.8      ┆ 50.0     ┆ null     ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
+        │ 75%        ┆ 3.0      ┆ 50.0     ┆ null     ┆ null ┆ 2022-12-31 ┆ 23:15:10 │
+        │ max        ┆ 3.0      ┆ 50.0     ┆ 1.0      ┆ zz   ┆ 2022-12-31 ┆ 23:15:10 │
+        └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘
+
+        Customize which percentiles are displayed, applying linear interpolation:
+
+        >>> df.describe(
+        ...     percentiles=[0.1, 0.3, 0.5, 0.7, 0.9],
+        ...     interpolation="linear",
+        ... )
+        shape: (11, 7)
+        ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐
+        │ statistic  ┆ float    ┆ int      ┆ bool     ┆ str  ┆ date       ┆ time     │
+        │ ---        ┆ ---      ┆ ---      ┆ ---      ┆ ---  ┆ ---        ┆ ---      │
+        │ str        ┆ f64      ┆ f64      ┆ f64      ┆ str  ┆ str        ┆ str      │
+        ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡
+        │ count      ┆ 3.0      ┆ 2.0      ┆ 3.0      ┆ 3    ┆ 3          ┆ 3        │
+        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0.0      ┆ 0    ┆ 0          ┆ 0        │
+        │ mean       ┆ 2.266667 ┆ 45.0     ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │
+        │ std        ┆ 1.101514 ┆ 7.071068 ┆ null     ┆ null ┆ null       ┆ null     │
+        │ min        ┆ 1.0      ┆ 40.0     ┆ 0.0      ┆ xx   ┆ 2020-01-01 ┆ 10:20:30 │
+        │ 10%        ┆ 1.36     ┆ 41.0     ┆ null     ┆ null ┆ 2020-04-20 ┆ 11:13:34 │
+        │ 30%        ┆ 2.08     ┆ 43.0     ┆ null     ┆ null ┆ 2020-11-26 ┆ 12:59:42 │
+        │ 50%        ┆ 2.8      ┆ 45.0     ┆ null     ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
+        │ 70%        ┆ 2.88     ┆ 47.0     ┆ null     ┆ null ┆ 2022-02-07 ┆ 18:09:34 │
+        │ 90%        ┆ 2.96     ┆ 49.0     ┆ null     ┆ null ┆ 2022-09-13 ┆ 21:33:18 │
+        │ max        ┆ 3.0      ┆ 50.0     ┆ 1.0      ┆ zz   ┆ 2022-12-31 ┆ 23:15:10 │
+        └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘
         """
         if not self.columns:
             msg = "cannot describe a DataFrame without any columns"
             raise TypeError(msg)
 
-        # Determine which columns should get std/mean/percentile statistics
-        stat_cols = {c for c, dt in self.schema.items() if dt.is_numeric()}
-
-        # Determine metrics and optional/additional percentiles
+        # create list of metrics
         metrics = ["count", "null_count", "mean", "std", "min"]
-        percentile_exprs = []
-
-        percentiles = parse_percentiles(percentiles)
-        for p in percentiles:
-            for c in self.columns:
-                expr = F.col(c).quantile(p) if c in stat_cols else F.lit(None)
-                expr = expr.alias(f"{p}:{c}")
-                percentile_exprs.append(expr)
-            metrics.append(f"{p * 100:g}%")
+        if quantiles := parse_percentiles(percentiles):
+            metrics.extend(f"{q * 100:g}%" for q in quantiles)
         metrics.append("max")
 
-        mean_exprs = [
-            (F.col(c).mean() if c in stat_cols else F.lit(None)).alias(f"mean:{c}")
-            for c in self.columns
-        ]
-        std_exprs = [
-            (F.col(c).std() if c in stat_cols else F.lit(None)).alias(f"std:{c}")
-            for c in self.columns
-        ]
+        @lru_cache
+        def skip_minmax(dt: PolarsDataType) -> bool:
+            return dt.is_nested() or dt in (Object, Null, Unknown, Categorical, Enum)
 
-        minmax_cols = {
-            c
-            for c, dt in self.schema.items()
-            if not dt.is_nested()
-            and dt not in (Object, Null, Unknown, Categorical, Enum)
-        }
-        min_exprs = [
-            (F.col(c).min() if c in minmax_cols else F.lit(None)).alias(f"min:{c}")
-            for c in self.columns
-        ]
-        max_exprs = [
-            (F.col(c).max() if c in minmax_cols else F.lit(None)).alias(f"max:{c}")
-            for c in self.columns
-        ]
+        # determine which columns get std/mean/percentile stats
+        has_numeric_result, sort_cols = set(), set()
+        metric_exprs = []
+        null = F.lit(None)
 
-        # If more than one quantile is requested,
-        # sort numerical columns to make them O(1).
-        # TODO: Should be removed once Polars supports
-        # getting multiples quantiles at once.
-        sort_exprs = [
-            (F.col(c).sort() if len(percentiles) > 1 and c in stat_cols else F.col(c))
-            for c in self.columns
-        ]
-        # Calculate metrics in parallel
-        df_metrics = self.select(*sort_exprs).select(
-            F.all().count().name.prefix("count:"),
-            F.all().null_count().name.prefix("null_count:"),
-            *mean_exprs,
-            *std_exprs,
-            *min_exprs,
-            *percentile_exprs,
-            *max_exprs,
+        for c, dt in self.schema.items():
+            is_numeric = dt.is_numeric()
+            is_temporal = not is_numeric and dt.is_temporal()
+
+            # counts
+            count_exprs = [
+                F.col(c).count().name.prefix("count:"),
+                F.col(c).null_count().name.prefix("null_count:"),
+            ]
+            metric_exprs.extend(count_exprs)
+
+            # mean
+            if is_temporal:
+                mean_expr = F.col(c).to_physical().mean().cast(dt)
+            else:
+                mean_expr = F.col(c).mean() if is_numeric or dt == Boolean else null
+            metric_exprs.append(mean_expr.alias(f"mean:{c}"))
+
+            # standard deviation
+            expr_std = F.col(c).std() if is_numeric else null
+            metric_exprs.append(expr_std.alias(f"std:{c}"))
+
+            # min
+            min_expr = F.col(c).min() if not skip_minmax(dt) else null
+            metric_exprs.append(min_expr.alias(f"min:{c}"))
+
+            # percentiles
+            for p in quantiles:
+                if is_numeric or is_temporal:
+                    pct_expr = (
+                        F.col(c).to_physical().quantile(p, interpolation).cast(dt)
+                        if is_temporal
+                        else F.col(c).quantile(p, interpolation)
+                    )
+                    sort_cols.add(c)
+                else:
+                    pct_expr = null
+                metric_exprs.append(pct_expr.alias(f"{p}:{c}"))
+
+            # max
+            metric_exprs.append(
+                (F.col(c).max() if not skip_minmax(dt) else null).alias(f"max:{c}")
+            )
+
+            if is_numeric or dt.is_nested() or dt in (Null, Boolean):
+                has_numeric_result.add(c)
+
+        # if more than one quantile is requested, sort relevant columns to make them O(1)
+        # TODO: remove once we have engine support for retrieving multiple quantiles
+        lf = (
+            self.lazy().with_columns(F.col(c).sort() for c in sort_cols)
+            if sort_cols
+            else self.lazy()
         )
 
-        # Reshape wide result
-        described = [
-            df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
-            for n in range(len(metrics))
+        # calculate metrics in parallel
+        df_metrics = lf.select(*metric_exprs).collect()
+
+        # reshape wide result
+        n_metrics = len(metrics)
+        column_metrics = [
+            df_metrics.row(0)[(n * n_metrics) : (n + 1) * n_metrics]
+            for n in range(self.width)
         ]
+        summary = dict(zip(self.columns, column_metrics))
 
-        # Cast by column type (numeric/bool -> float), (other -> string)
-        summary = dict(zip(self.columns, list(zip(*described))))
+        # cast by column type (numeric/bool -> float), (other -> string)
         for c in self.columns:
             summary[c] = [  # type: ignore[assignment]
                 None
                 if (v is None or isinstance(v, dict))
-                else (float(v) if c in stat_cols else str(v))
+                else (float(v) if (c in has_numeric_result) else str(v))
                 for v in summary[c]
             ]
 
-        # Return results as a DataFrame
+        # return results as a DataFrame
         df_summary = self._from_dict(summary)
-        df_summary.insert_column(0, pl.Series("describe", metrics))
+        df_summary.insert_column(0, pl.Series("statistic", metrics))
         return df_summary
 
     def get_column_index(self, name: str) -> int:
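[reviewer note, not part of the patch] The temporal branches above hinge on a to_physical()/cast round-trip, because Date/Time columns have no native mean or quantile. A minimal standalone sketch of that idea, assuming current polars semantics (the frame and column names are illustrative, not taken from the patch):

from datetime import date

import polars as pl

df = pl.DataFrame({"d": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)]})

# aggregate over the physical representation (days since the epoch for Date),
# then cast the numeric result back to the original temporal dtype
out = df.select(
    pl.col("d").to_physical().mean().cast(pl.Date).alias("mean"),
    pl.col("d").to_physical().quantile(0.5, "nearest").cast(pl.Date).alias("median"),
)
print(out)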
diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py
index 1c0f1aa13014..1bdacbd4e506 100644
--- a/py-polars/polars/series/series.py
+++ b/py-polars/polars/series/series.py
@@ -102,7 +102,6 @@
 from polars.utils.various import (
     _is_generator,
     no_default,
-    parse_percentiles,
     parse_version,
     range_to_series,
     range_to_slice,
@@ -1870,7 +1869,9 @@ def to_frame(self, name: str | None = None) -> DataFrame:
         return wrap_df(PyDataFrame([self._s]))
 
     def describe(
-        self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
+        self,
+        percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75),
+        interpolation: RollingInterpolationMethod = "nearest",
     ) -> DataFrame:
         """
         Quick summary statistics of a Series.
@@ -1883,6 +1884,8 @@
         percentiles
             One or more percentiles to include in the summary statistics (if the
             Series has a numeric dtype). All values must be in the range `[0, 1]`.
+        interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
+            Interpolation method used when calculating percentiles.
 
         Notes
         -----
@@ -1916,68 +1919,26 @@
 
         Non-numeric data types may not have all statistics available.
 
-        >>> s = pl.Series(["a", "a", None, "b", "c"])
+        >>> s = pl.Series(["aa", "aa", None, "bb", "cc"])
         >>> s.describe()
-        shape: (3, 2)
+        shape: (4, 2)
         ┌────────────┬───────┐
         │ statistic  ┆ value │
         │ ---        ┆ ---   │
-        │ str        ┆ i64   │
+        │ str        ┆ str   │
         ╞════════════╪═══════╡
         │ count      ┆ 4     │
         │ null_count ┆ 1     │
-        │ unique     ┆ 4     │
+        │ min        ┆ aa    │
+        │ max        ┆ cc    │
         └────────────┴───────┘
         """
-        stats: dict[str, PythonLiteral | None]
-        stats_dtype: PolarsDataType
-
-        if self.dtype.is_numeric():
-            stats_dtype = Float64
-            stats = {
-                "count": self.count(),
-                "null_count": self.null_count(),
-                "mean": self.mean(),
-                "std": self.std(),
-                "min": self.min(),
-            }
-            for p in parse_percentiles(percentiles):
-                stats[f"{p:.0%}"] = self.quantile(p)
-            stats["max"] = self.max()
-
-        elif self.dtype == Boolean:
-            stats_dtype = Int64
-            stats = {
-                "count": self.count(),
-                "null_count": self.null_count(),
-                "sum": self.sum(),
-            }
-        elif self.dtype == String:
-            stats_dtype = Int64
-            stats = {
-                "count": self.count(),
-                "null_count": self.null_count(),
-                "unique": self.n_unique(),
-            }
-        elif self.dtype.is_temporal():
-            # we coerce all to string, because a polars column
-            # only has a single dtype and dates: datetime and count: int don't match
-            stats_dtype = String
-            stats = {
-                "count": str(self.count()),
-                "null_count": str(self.null_count()),
-                "min": str(self.dt.min()),
-                "50%": str(self.dt.median()),
-                "max": str(self.dt.max()),
-            }
-        else:
-            msg = f"cannot describe Series of data type {self.dtype}"
-            raise TypeError(msg)
-
-        return pl.DataFrame(
-            {"statistic": stats.keys(), "value": stats.values()},
-            schema={"statistic": String, "value": stats_dtype},
+        stats = self.to_frame().describe(
+            percentiles=percentiles,
+            interpolation=interpolation,
         )
+        stats.columns = ["statistic", "value"]
+        return stats.filter(F.col("value").is_not_null())
 
     def sum(self) -> int | float:
         """
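[reviewer note, not part of the patch] Series.describe is now a thin wrapper over DataFrame.describe, so the two can no longer drift apart. A rough sketch of the equivalence the new body relies on, assuming a polars build with this patch applied (the series name and values are illustrative):

import polars as pl
from polars.testing import assert_frame_equal

s = pl.Series("x", [1.0, 2.5, None, 4.0])

# what the wrapper does: describe the one-column frame, relabel the columns to
# the statistic/value layout, then drop statistics that came back all-null
via_frame = s.to_frame().describe()
via_frame.columns = ["statistic", "value"]
via_frame = via_frame.filter(pl.col("value").is_not_null())

assert_frame_equal(s.describe(), via_frame)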
diff --git a/py-polars/tests/unit/dataframe/test_describe.py b/py-polars/tests/unit/dataframe/test_describe.py
index b5f0d8360848..a9f855ffd054 100644
--- a/py-polars/tests/unit/dataframe/test_describe.py
+++ b/py-polars/tests/unit/dataframe/test_describe.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from datetime import date
+from datetime import date, datetime, time
 
 import pytest
 
@@ -16,16 +16,20 @@ def test_df_describe() -> None:
             "c": [True, False, True],
             "d": [None, "b", "c"],
             "e": ["usd", "eur", None],
-            "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)],
+            "f": [
+                datetime(2020, 1, 1, 10, 30),
+                datetime(2021, 7, 5, 15, 0),
+                datetime(2022, 12, 31, 20, 30),
+            ],
+            "g": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)],
+            "h": [time(10, 30), time(15, 0), time(20, 30)],
         },
         schema_overrides={"e": pl.Categorical},
     )
-
     result = df.describe()
-    print(result)
     expected = pl.DataFrame(
         {
-            "describe": [
+            "statistic": [
                 "count",
                 "null_count",
                 "mean",
@@ -48,10 +52,42 @@ def test_df_describe() -> None:
                 3.0,
             ],
             "b": [2.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 5.0, 5.0, 5.0],
-            "c": ["3", "0", None, None, "False", None, None, None, "True"],
+            "c": [3.0, 0.0, 2 / 3, None, False, None, None, None, True],
             "d": ["2", "1", None, None, "b", None, None, None, "c"],
             "e": ["2", "1", None, None, None, None, None, None, None],
-            "f": ["3", "0", None, None, "2020-01-01", None, None, None, "2022-01-01"],
+            "f": [
+                "3",
+                "0",
+                "2021-07-03 07:20:00",
+                None,
+                "2020-01-01 10:30:00",
+                "2021-07-05 15:00:00",
+                "2021-07-05 15:00:00",
+                "2022-12-31 20:30:00",
+                "2022-12-31 20:30:00",
+            ],
+            "g": [
+                "3",
+                "0",
+                "2021-07-02",
+                None,
+                "2020-01-01",
+                "2021-07-05",
+                "2021-07-05",
+                "2022-12-31",
+                "2022-12-31",
+            ],
+            "h": [
+                "3",
+                "0",
+                "15:20:00",
+                None,
+                "10:30:00",
+                "15:00:00",
+                "15:00:00",
+                "20:30:00",
+                "20:30:00",
+            ],
         }
     )
     assert_frame_equal(result, expected)
@@ -64,9 +100,7 @@ def test_df_describe_nested() -> None:
             "list": [[1, 2], [3, 4], [1, 2], None],
         }
     )
-
     result = df.describe()
-
     expected = pl.DataFrame(
         [
             ("count", 3, 3),
@@ -79,17 +113,15 @@
             ("75%", None, None),
             ("max", None, None),
         ],
-        schema=["describe"] + df.columns,
-        schema_overrides={"struct": pl.String, "list": pl.String},
+        schema=["statistic"] + df.columns,
+        schema_overrides={"struct": pl.Float64, "list": pl.Float64},
     )
     assert_frame_equal(result, expected)
 
 
 def test_df_describe_custom_percentiles() -> None:
     df = pl.DataFrame({"numeric": [1, 2, 1, None]})
-
    result = df.describe(percentiles=(0.2, 0.4, 0.5, 0.6, 0.8))
-
     expected = pl.DataFrame(
         [
             ("count", 3.0),
@@ -104,7 +136,7 @@
             ("80%", 2.0),
             ("max", 2.0),
         ],
-        schema=["describe"] + df.columns,
+        schema=["statistic"] + df.columns,
     )
     assert_frame_equal(result, expected)
 
@@ -112,9 +144,7 @@
 @pytest.mark.parametrize("pcts", [None, []])
 def test_df_describe_no_percentiles(pcts: list[float] | None) -> None:
     df = pl.DataFrame({"numeric": [1, 2, 1, None]})
-
     result = df.describe(percentiles=pcts)
-
     expected = pl.DataFrame(
         [
             ("count", 3.0),
@@ -124,16 +154,14 @@
             ("min", 1.0),
             ("max", 2.0),
         ],
-        schema=["describe"] + df.columns,
+        schema=["statistic"] + df.columns,
     )
     assert_frame_equal(result, expected)
 
 
 def test_df_describe_empty_column() -> None:
     df = pl.DataFrame(schema={"a": pl.Int64})
-
     result = df.describe()
-
     expected = pl.DataFrame(
         [
             ("count", 0.0),
@@ -146,7 +174,7 @@
             ("75%", None),
             ("max", None),
         ],
-        schema=["describe"] + df.columns,
+        schema=["statistic"] + df.columns,
     )
     assert_frame_equal(result, expected)
 
@@ -162,7 +190,7 @@ def test_df_describe_empty() -> None:
 def test_df_describe_quantile_precision() -> None:
     df = pl.DataFrame({"a": range(10)})
     result = df.describe(percentiles=[0.99, 0.999, 0.9999])
-    result_metrics = result.get_column("describe").to_list()
+    result_metrics = result.get_column("statistic").to_list()
     expected_metrics = ["99%", "99.9%", "99.99%"]
     for m in expected_metrics:
         assert m in result_metrics
@@ -178,5 +206,7 @@ def test_df_describe_object() -> None:
     result = df.describe(percentiles=(0.05, 0.25, 0.5, 0.75, 0.95))
 
-    expected = pl.DataFrame({"describe": ["count", "null_count"], "object": ["3", "0"]})
+    expected = pl.DataFrame(
+        {"statistic": ["count", "null_count"], "object": ["3", "0"]}
+    )
 
     assert_frame_equal(result.head(2), expected)
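[reviewer note, not part of the patch] The reshape step these tests exercise computes every metric for every column in a single one-row select, then slices that row back into one tuple of metrics per column. A toy version of just the slicing arithmetic, with invented values:

metrics = ["count", "null_count", "mean"]
columns = ["a", "b"]
row = (3, 0, 2.0, 3, 1, 4.5)  # all metrics for "a" first, then all metrics for "b"

n_metrics = len(metrics)
summary = {c: row[i * n_metrics : (i + 1) * n_metrics] for i, c in enumerate(columns)}
assert summary == {"a": (3, 0, 2.0), "b": (3, 1, 4.5)}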
diff --git a/py-polars/tests/unit/series/test_describe.py b/py-polars/tests/unit/series/test_describe.py
index 1cd20ffe6825..15ed7bc84c54 100644
--- a/py-polars/tests/unit/series/test_describe.py
+++ b/py-polars/tests/unit/series/test_describe.py
@@ -1,7 +1,5 @@
 from datetime import date
 
-import pytest
-
 import polars as pl
 from polars.testing.asserts.frame import assert_frame_equal
@@ -49,9 +47,10 @@ def test_series_describe_string() -> None:
     result = s.describe()
     stats = {
-        "count": 3,
-        "null_count": 0,
-        "unique": 3,
+        "count": "3",
+        "null_count": "0",
+        "min": "abc",
+        "max": "xyz",
     }
     expected = pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
     assert_frame_equal(expected, result)
 
@@ -64,22 +63,30 @@ def test_series_describe_boolean() -> None:
     stats = {
         "count": 4,
         "null_count": 1,
-        "sum": 3,
+        "mean": 0.75,
+        "min": False,
+        "max": True,
     }
-    expected = pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
+    expected = pl.DataFrame(
+        data={"statistic": stats.keys(), "value": stats.values()},
+        schema_overrides={"value": pl.Float64},
+    )
     assert_frame_equal(expected, result)
 
 
 def test_series_describe_date() -> None:
-    s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)])
-    result = s.describe()
+    s = pl.Series([date(1999, 12, 31), date(2011, 3, 11), date(2021, 1, 18)])
+    result = s.describe(interpolation="linear")
     stats = {
         "count": "3",
         "null_count": "0",
-        "min": "2021-01-01",
-        "50%": "2021-01-02",
-        "max": "2021-01-03",
+        "mean": "2010-09-29",
+        "min": "1999-12-31",
+        "25%": "2005-08-05",
+        "50%": "2011-03-11",
+        "75%": "2016-02-13",
+        "max": "2021-01-18",
     }
     expected = pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
     assert_frame_equal(expected, result)
 
@@ -88,25 +95,34 @@
 def test_series_describe_empty() -> None:
     s = pl.Series(dtype=pl.Float64)
     result = s.describe()
-    print(result)
     stats = {
         "count": 0.0,
         "null_count": 0.0,
-        "mean": None,
-        "std": None,
-        "min": None,
-        "25%": None,
-        "50%": None,
-        "75%": None,
-        "max": None,
     }
     expected = pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
     assert_frame_equal(expected, result)
 
 
-def test_series_describe_unsupported_dtype() -> None:
-    s = pl.Series(dtype=pl.List(pl.Int64))
-    with pytest.raises(
-        TypeError, match="cannot describe Series of data type List\\(Int64\\)"
-    ):
-        s.describe()
+def test_series_describe_null() -> None:
+    s = pl.Series([None, None], dtype=pl.Null)
+    result = s.describe()
+    stats = {
+        "count": 0.0,
+        "null_count": 2.0,
+    }
+    expected = pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
+    assert_frame_equal(expected, result)
+
+
+def test_series_describe_nested_list() -> None:
+    s = pl.Series(
+        values=[[10e10, 10e15], [10e12, 10e13], [10e10, 10e15]],
+        dtype=pl.List(pl.Int64),
+    )
+    result = s.describe()
+    stats = {
+        "count": 3.0,
+        "null_count": 0.0,
+    }
+    expected = pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
+    assert_frame_equal(expected, result)
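[reviewer note, not part of the patch] The interpolation parameter threaded through both describe methods has the same meaning as in Series.quantile, which the date expectations above depend on. A two-line illustration with invented values, assuming polars' usual quantile behaviour:

import polars as pl

s = pl.Series([1.0, 2.0, 3.0, 4.0])
print(s.quantile(0.25, interpolation="nearest"))  # 2.0 -- always an actual element
print(s.quantile(0.25, interpolation="linear"))  # 1.75 -- may fall between elements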