From 264af20718c54d5abcca9b50547ff332231a9fef Mon Sep 17 00:00:00 2001 From: Jeroen van Zundert Date: Mon, 10 Apr 2023 18:51:23 +0200 Subject: [PATCH] feat(python): Add median stat to Series.describe Plus a small reordering of the Series.describe output to match DataFrame.describe ordering as much as possible. --- py-polars/polars/series/series.py | 33 ++++++++++++++++------------- py-polars/tests/unit/test_series.py | 3 +++ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 86580b70757e..1aa28406e01f 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -1157,18 +1157,19 @@ def describe(self) -> DataFrame: -------- >>> series_num = pl.Series([1, 2, 3, 4, 5]) >>> series_num.describe() - shape: (6, 2) + shape: (7, 2) ┌────────────┬──────────┐ │ statistic ┆ value │ │ --- ┆ --- │ │ str ┆ f64 │ ╞════════════╪══════════╡ - │ min ┆ 1.0 │ - │ max ┆ 5.0 │ + │ count ┆ 5.0 │ │ null_count ┆ 0.0 │ │ mean ┆ 3.0 │ │ std ┆ 1.581139 │ - │ count ┆ 5.0 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ └────────────┴──────────┘ >>> series_str = pl.Series(["a", "a", None, "b", "c"]) @@ -1179,9 +1180,9 @@ def describe(self) -> DataFrame: │ --- ┆ --- │ │ str ┆ i64 │ ╞════════════╪═══════╡ - │ unique ┆ 4 │ - │ null_count ┆ 1 │ │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ └────────────┴───────┘ """ @@ -1192,33 +1193,35 @@ def describe(self) -> DataFrame: elif self.is_numeric(): s = self.cast(Float64) stats = { - "min": s.min(), - "max": s.max(), + "count": s.len(), "null_count": s.null_count(), "mean": s.mean(), "std": s.std(), - "count": s.len(), + "min": s.min(), + "max": s.max(), + "median": s.median(), } elif self.is_boolean(): stats = { - "sum": self.sum(), - "null_count": self.null_count(), "count": self.len(), + "null_count": self.null_count(), + "sum": self.sum(), } elif self.is_utf8(): stats = { - "unique": len(self.unique()), - "null_count": self.null_count(), "count": self.len(), + "null_count": self.null_count(), + "unique": len(self.unique()), } elif self.is_temporal(): # we coerce all to string, because a polars column # only has a single dtype and dates: datetime and count: int don't match stats = { + "count": str(self.len()), + "null_count": str(self.null_count()), "min": str(self.dt.min()), "max": str(self.dt.max()), - "null_count": str(self.null_count()), - "count": str(self.len()), + "median": str(self.dt.median()), } else: raise TypeError("This type is not supported") diff --git a/py-polars/tests/unit/test_series.py b/py-polars/tests/unit/test_series.py index c8843439a047..cd01500b6ed8 100644 --- a/py-polars/tests/unit/test_series.py +++ b/py-polars/tests/unit/test_series.py @@ -1130,6 +1130,7 @@ def test_describe() -> None: "min": 1.0, "null_count": 0.0, "std": 1.0, + "median": 2.0, } assert dict(float_s.describe().rows()) == { # type: ignore[arg-type] "count": 3.0, @@ -1138,6 +1139,7 @@ def test_describe() -> None: "min": 1.3, "null_count": 0.0, "std": 3.8109491381194442, + "median": 4.6, } assert dict(str_s.describe().rows()) == { # type: ignore[arg-type] "count": 3, @@ -1153,6 +1155,7 @@ def test_describe() -> None: "count": "3", "max": "2021-01-03", "min": "2021-01-01", + "median": "2021-01-02", "null_count": "0", }