From 5fda04e5f52273a11dd9475a1823083ee971f6be Mon Sep 17 00:00:00 2001
From: taki <taki.mekhalfa@dataimpact.io>
Date: Sun, 21 Jan 2024 12:49:41 +0100
Subject: [PATCH] Support `mean` for `bool` columns

---
 py-polars/polars/dataframe/frame.py           | 43 +++++++++++--------
 py-polars/polars/datatypes/classes.py         |  5 +++
 py-polars/polars/series/series.py             | 11 +++++
 py-polars/src/series/mod.rs                   |  4 ++
 .../tests/unit/dataframe/test_describe.py     |  2 +-
 5 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index 220a0b5edf3ad..514840d2bfdcf 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -4391,6 +4391,9 @@ def describe(
         -----
         The median is included by default as the 50% percentile.
 
+        The mean for boolean columns is the ratio of true values
+        to the total non-null values.
+
         Warnings
         --------
         We will never guarantee the output of describe to be stable.
@@ -4416,21 +4419,21 @@ def describe(
         ... )
         >>> df.describe()
         shape: (9, 7)
-        ┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐
-        │ describe   ┆ float    ┆ int      ┆ bool  ┆ str  ┆ str2 ┆ date       │
-        │ ---        ┆ ---      ┆ ---      ┆ ---   ┆ ---  ┆ ---  ┆ ---        │
-        │ str        ┆ f64      ┆ f64      ┆ str   ┆ str  ┆ str  ┆ str        │
-        ╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡
-        │ count      ┆ 3.0      ┆ 2.0      ┆ 3     ┆ 2    ┆ 2    ┆ 3          │
-        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0     ┆ 1    ┆ 1    ┆ 0          │
-        │ mean       ┆ 2.266667 ┆ 4.5      ┆ null  ┆ null ┆ null ┆ null       │
-        │ std        ┆ 1.101514 ┆ 0.707107 ┆ null  ┆ null ┆ null ┆ null       │
-        │ min        ┆ 1.0      ┆ 4.0      ┆ False ┆ b    ┆ eur  ┆ 2020-01-01 │
-        │ 25%        ┆ 2.8      ┆ 4.0      ┆ null  ┆ null ┆ null ┆ null       │
-        │ 50%        ┆ 2.8      ┆ 5.0      ┆ null  ┆ null ┆ null ┆ null       │
-        │ 75%        ┆ 3.0      ┆ 5.0      ┆ null  ┆ null ┆ null ┆ null       │
-        │ max        ┆ 3.0      ┆ 5.0      ┆ True  ┆ c    ┆ usd  ┆ 2022-01-01 │
-        └────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘
+        ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐
+        │ describe   ┆ float    ┆ int      ┆ bool     ┆ str  ┆ str2 ┆ date       │
+        │ ---        ┆ ---      ┆ ---      ┆ ---      ┆ ---  ┆ ---  ┆ ---        │
+        │ str        ┆ f64      ┆ f64      ┆ str      ┆ str  ┆ str  ┆ str        │
+        ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
+        │ count      ┆ 3.0      ┆ 2.0      ┆ 3        ┆ 2    ┆ 2    ┆ 3          │
+        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0        ┆ 1    ┆ 1    ┆ 0          │
+        │ mean       ┆ 2.266667 ┆ 4.5      ┆ 0.666667 ┆ null ┆ null ┆ null       │
+        │ std        ┆ 1.101514 ┆ 0.707107 ┆ null     ┆ null ┆ null ┆ null       │
+        │ min        ┆ 1.0      ┆ 4.0      ┆ false    ┆ b    ┆ eur  ┆ 2020-01-01 │
+        │ 25%        ┆ 2.8      ┆ 4.0      ┆ null     ┆ null ┆ null ┆ null       │
+        │ 50%        ┆ 2.8      ┆ 5.0      ┆ null     ┆ null ┆ null ┆ null       │
+        │ 75%        ┆ 3.0      ┆ 5.0      ┆ null     ┆ null ┆ null ┆ null       │
+        │ max        ┆ 3.0      ┆ 5.0      ┆ true     ┆ c    ┆ usd  ┆ 2022-01-01 │
+        └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
         """
         if not self.columns:
             msg = "cannot describe a DataFrame without any columns"
@@ -4438,7 +4441,8 @@ def describe(
 
         # Determine which columns should get std/mean/percentile statistics
         stat_cols = {c for c, dt in self.schema.items() if dt.is_numeric()}
-
+        # Determine which boolean columns should additionally get the mean statistic
+        bool_cols = {c for c, dt in self.schema.items() if dt.is_bool()}
         # Determine metrics and optional/additional percentiles
         metrics = ["count", "null_count", "mean", "std", "min"]
         percentile_exprs = []
@@ -4451,7 +4455,9 @@ def describe(
         metrics.append("max")
 
         mean_exprs = [
-            (F.col(c).mean() if c in stat_cols else F.lit(None)).alias(f"mean:{c}")
+            (
+                F.col(c).mean() if c in stat_cols or c in bool_cols else F.lit(None)
+            ).alias(f"mean:{c}")
             for c in self.columns
         ]
         std_exprs = [
@@ -4490,14 +4496,13 @@ def describe(
             df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
             for n in range(len(metrics))
         ]
-
         # Cast by column type (numeric/bool -> float), (other -> string)
         summary = dict(zip(self.columns, list(zip(*described))))
         for c in self.columns:
             summary[c] = [  # type: ignore[assignment]
                 None
                 if (v is None or isinstance(v, dict))
-                else (float(v) if c in stat_cols else str(v))
+                else (float(v) if c in stat_cols else pl.Series([v])._str_value(0))
                 for v in summary[c]
             ]
 
diff --git a/py-polars/polars/datatypes/classes.py b/py-polars/polars/datatypes/classes.py
index 00bcfecda4b55..82800450b2a73 100644
--- a/py-polars/polars/datatypes/classes.py
+++ b/py-polars/polars/datatypes/classes.py
@@ -218,6 +218,11 @@ def is_nested(cls) -> bool:
         """Check whether the data type is a nested type."""
         return issubclass(cls, NestedType)
 
+    @classmethod
+    def is_bool(cls) -> bool:
+        """Check whether the data type is Boolean (or a subclass of Boolean)."""
+        return issubclass(cls, Boolean)
+
 
 def _custom_reconstruct(
     cls: type[Any], base: type[Any], state: Any
diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py
index fb8f703c7a165..a7d5849bf4021 100644
--- a/py-polars/polars/series/series.py
+++ b/py-polars/polars/series/series.py
@@ -382,6 +382,17 @@ def _from_pandas(
             pandas_to_pyseries(name, values, nan_to_null=nan_to_null)
         )
 
+    def _str_value(self, index: int) -> str:
+        """
+        Return the string representation of the single element at `index`.
+
+        Returns
+        -------
+        str
+            The element at `index`, formatted by `self._s.str_value`.
+        """
+        return self._s.str_value(index)
+
     def _get_buffer_info(self) -> BufferInfo:
         """
         Return pointer, offset, and length information about the underlying buffer.
diff --git a/py-polars/src/series/mod.rs b/py-polars/src/series/mod.rs
index 31f57a8896d76..0a2b7db6648b5 100644
--- a/py-polars/src/series/mod.rs
+++ b/py-polars/src/series/mod.rs
@@ -145,6 +145,10 @@ impl PySeries {
         }
     }
 
+    fn str_value(&self, index: usize) -> PyResult<String> { // Err, not panic, on bad index
+        Ok(self.series.str_value(index).map_err(PyPolarsErr::from)?.into_owned())
+    }
+
     fn rechunk(&mut self, in_place: bool) -> Option<Self> {
         let series = self.series.rechunk();
         if in_place {
diff --git a/py-polars/tests/unit/dataframe/test_describe.py b/py-polars/tests/unit/dataframe/test_describe.py
index b5f0d8360848d..a1e30b700861c 100644
--- a/py-polars/tests/unit/dataframe/test_describe.py
+++ b/py-polars/tests/unit/dataframe/test_describe.py
@@ -48,7 +48,7 @@ def test_df_describe() -> None:
                 3.0,
             ],
             "b": [2.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 5.0, 5.0, 5.0],
-            "c": ["3", "0", None, None, "False", None, None, None, "True"],
+            "c": ["3", "0", "0.666667", None, "false", None, None, None, "true"],
             "d": ["2", "1", None, None, "b", None, None, None, "c"],
             "e": ["2", "1", None, None, None, None, None, None, None],
             "f": ["3", "0", None, None, "2020-01-01", None, None, None, "2022-01-01"],