From 5fda04e5f52273a11dd9475a1823083ee971f6be Mon Sep 17 00:00:00 2001 From: taki Date: Sun, 21 Jan 2024 12:49:41 +0100 Subject: [PATCH] Support `mean` for `bool` columns --- py-polars/polars/dataframe/frame.py | 43 +++++++++++-------- py-polars/polars/datatypes/classes.py | 5 +++ py-polars/polars/series/series.py | 11 +++++ py-polars/src/series/mod.rs | 4 ++ .../tests/unit/dataframe/test_describe.py | 2 +- 5 files changed, 45 insertions(+), 20 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 220a0b5edf3ad..514840d2bfdcf 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -4391,6 +4391,9 @@ def describe( ----- The median is included by default as the 50% percentile. + The mean for boolean columns is the ratio of true values + to the total non-null values. + Warnings -------- We will never guarantee the output of describe to be stable. @@ -4416,21 +4419,21 @@ def describe( ... ) >>> df.describe() shape: (9, 7) - ┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐ - │ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡ - │ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null ┆ null │ - │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 4.0 ┆ False ┆ b ┆ eur ┆ 2020-01-01 │ - │ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ - │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ max ┆ 3.0 ┆ 5.0 ┆ True ┆ c ┆ usd ┆ 2022-01-01 │ - └────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘ + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │ + │ --- ┆ --- ┆ --- 
┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ false ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ true ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ """ if not self.columns: msg = "cannot describe a DataFrame without any columns" @@ -4438,7 +4441,8 @@ def describe( # Determine which columns should get std/mean/percentile statistics stat_cols = {c for c, dt in self.schema.items() if dt.is_numeric()} - + # Determine bool columns to include mean statistics + bool_cols = {c for c, dt in self.schema.items() if dt.is_bool()} # Determine metrics and optional/additional percentiles metrics = ["count", "null_count", "mean", "std", "min"] percentile_exprs = [] @@ -4451,7 +4455,9 @@ def describe( metrics.append("max") mean_exprs = [ - (F.col(c).mean() if c in stat_cols else F.lit(None)).alias(f"mean:{c}") + ( + F.col(c).mean() if c in stat_cols or c in bool_cols else F.lit(None) + ).alias(f"mean:{c}") for c in self.columns ] std_exprs = [ @@ -4490,14 +4496,13 @@ def describe( df_metrics.row(0)[(n * self.width) : (n + 1) * self.width] for n in range(len(metrics)) ] - # Cast by column type (numeric/bool -> float), (other -> string) summary = dict(zip(self.columns, list(zip(*described)))) for c in self.columns: summary[c] = [ # type: ignore[assignment] None if (v is None or isinstance(v, dict)) - else (float(v) if c in stat_cols else str(v)) + else (float(v) if c in stat_cols else pl.Series([v])._str_value(0)) for v in summary[c] ] diff --git 
a/py-polars/polars/datatypes/classes.py b/py-polars/polars/datatypes/classes.py index 00bcfecda4b55..82800450b2a73 100644 --- a/py-polars/polars/datatypes/classes.py +++ b/py-polars/polars/datatypes/classes.py @@ -218,6 +218,11 @@ def is_nested(cls) -> bool: """Check whether the data type is a nested type.""" return issubclass(cls, NestedType) + @classmethod + def is_bool(cls) -> bool: + """Check whether the data type is a boolean type.""" + return issubclass(cls, Boolean) + def _custom_reconstruct( cls: type[Any], base: type[Any], state: Any diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index fb8f703c7a165..a7d5849bf4021 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -382,6 +382,17 @@ def _from_pandas( pandas_to_pyseries(name, values, nan_to_null=nan_to_null) ) + def _str_value(self, index: int) -> str: + """ + Return a string representation of the element at `index`. + + Returns + ------- + str + String representation of the element. + """ + return self._s.str_value(index) + def _get_buffer_info(self) -> BufferInfo: """ Return pointer, offset, and length information about the underlying buffer. 
diff --git a/py-polars/src/series/mod.rs b/py-polars/src/series/mod.rs index 31f57a8896d76..0a2b7db6648b5 100644 --- a/py-polars/src/series/mod.rs +++ b/py-polars/src/series/mod.rs @@ -145,6 +145,10 @@ impl PySeries { } } + fn str_value(&self, index: usize) -> String { + self.series.str_value(index).unwrap().into() + } + fn rechunk(&mut self, in_place: bool) -> Option { let series = self.series.rechunk(); if in_place { diff --git a/py-polars/tests/unit/dataframe/test_describe.py b/py-polars/tests/unit/dataframe/test_describe.py index b5f0d8360848d..a1e30b700861c 100644 --- a/py-polars/tests/unit/dataframe/test_describe.py +++ b/py-polars/tests/unit/dataframe/test_describe.py @@ -48,7 +48,7 @@ def test_df_describe() -> None: 3.0, ], "b": [2.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 5.0, 5.0, 5.0], - "c": ["3", "0", None, None, "False", None, None, None, "True"], + "c": ["3", "0", "0.666667", None, "false", None, None, None, "true"], "d": ["2", "1", None, None, "b", None, None, None, "c"], "e": ["2", "1", None, None, None, None, None, None, None], "f": ["3", "0", None, None, "2020-01-01", None, None, None, "2022-01-01"],