pola-rs · taki-mekhalfa · Jan 21, 2024 · Jan 21, 2024
@@ -4391,6 +4391,9 @@ def describe(
         -----
         The median is included by default as the 50% percentile.
 
+        The mean for boolean columns is the fraction of non-null values
+        that are true.
+
         Warnings
         --------
         We will never guarantee the output of describe to be stable.
@@ -4416,29 +4419,30 @@ def describe(
         ... )
         >>> df.describe()
         shape: (9, 7)
-        ┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐
-        │ describe   ┆ float    ┆ int      ┆ bool  ┆ str  ┆ str2 ┆ date       │
-        │ ---        ┆ ---      ┆ ---      ┆ ---   ┆ ---  ┆ ---  ┆ ---        │
-        │ str        ┆ f64      ┆ f64      ┆ str   ┆ str  ┆ str  ┆ str        │
-        ╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡
-        │ count      ┆ 3.0      ┆ 2.0      ┆ 3     ┆ 2    ┆ 2    ┆ 3          │
-        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0     ┆ 1    ┆ 1    ┆ 0          │
-        │ mean       ┆ 2.266667 ┆ 4.5      ┆ null  ┆ null ┆ null ┆ null       │
-        │ std        ┆ 1.101514 ┆ 0.707107 ┆ null  ┆ null ┆ null ┆ null       │
-        │ min        ┆ 1.0      ┆ 4.0      ┆ False ┆ b    ┆ eur  ┆ 2020-01-01 │
-        │ 25%        ┆ 2.8      ┆ 4.0      ┆ null  ┆ null ┆ null ┆ null       │
-        │ 50%        ┆ 2.8      ┆ 5.0      ┆ null  ┆ null ┆ null ┆ null       │
-        │ 75%        ┆ 3.0      ┆ 5.0      ┆ null  ┆ null ┆ null ┆ null       │
-        │ max        ┆ 3.0      ┆ 5.0      ┆ True  ┆ c    ┆ usd  ┆ 2022-01-01 │
-        └────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘
+        ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐
+        │ describe   ┆ float    ┆ int      ┆ bool     ┆ str  ┆ str2 ┆ date       │
+        │ ---        ┆ ---      ┆ ---      ┆ ---      ┆ ---  ┆ ---  ┆ ---        │
+        │ str        ┆ f64      ┆ f64      ┆ str      ┆ str  ┆ str  ┆ str        │
+        ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
+        │ count      ┆ 3.0      ┆ 2.0      ┆ 3        ┆ 2    ┆ 2    ┆ 3          │
+        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0        ┆ 1    ┆ 1    ┆ 0          │
+        │ mean       ┆ 2.266667 ┆ 4.5      ┆ 0.666667 ┆ null ┆ null ┆ null       │
+        │ std        ┆ 1.101514 ┆ 0.707107 ┆ null     ┆ null ┆ null ┆ null       │
+        │ min        ┆ 1.0      ┆ 4.0      ┆ false    ┆ b    ┆ eur  ┆ 2020-01-01 │
+        │ 25%        ┆ 2.8      ┆ 4.0      ┆ null     ┆ null ┆ null ┆ null       │
+        │ 50%        ┆ 2.8      ┆ 5.0      ┆ null     ┆ null ┆ null ┆ null       │
+        │ 75%        ┆ 3.0      ┆ 5.0      ┆ null     ┆ null ┆ null ┆ null       │
+        │ max        ┆ 3.0      ┆ 5.0      ┆ true     ┆ c    ┆ usd  ┆ 2022-01-01 │
+        └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
         """
         if not self.columns:
             msg = "cannot describe a DataFrame without any columns"
             raise TypeError(msg)
 
         # Determine which columns should get std/mean/percentile statistics
         stat_cols = {c for c, dt in self.schema.items() if dt.is_numeric()}
-
+        # Determine which boolean columns should also get the mean statistic
+        bool_cols = {c for c, dt in self.schema.items() if dt.is_bool()}
         # Determine metrics and optional/additional percentiles
         metrics = ["count", "null_count", "mean", "std", "min"]
         percentile_exprs = []
@@ -4451,7 +4455,9 @@ def describe(
         metrics.append("max")
 
         mean_exprs = [
-            (F.col(c).mean() if c in stat_cols else F.lit(None)).alias(f"mean:{c}")
+            (
+                F.col(c).mean() if c in stat_cols or c in bool_cols else F.lit(None)
+            ).alias(f"mean:{c}")
             for c in self.columns
         ]
         std_exprs = [
@@ -4490,14 +4496,13 @@ def describe(
             df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
             for n in range(len(metrics))
         ]
-
         # Cast by column type (numeric -> float), (other, including bool -> string)
         summary = dict(zip(self.columns, list(zip(*described))))
         for c in self.columns:
             summary[c] = [  # type: ignore[assignment]
                 None
                 if (v is None or isinstance(v, dict))
-                else (float(v) if c in stat_cols else str(v))
+                else (float(v) if c in stat_cols else pl.Series([v])._str_value(0))
                 for v in summary[c]
             ]
 

@@ -218,6 +218,11 @@ def is_nested(cls) -> bool:
         """Check whether the data type is a nested type."""
         return issubclass(cls, NestedType)
 
+    @classmethod
+    def is_bool(cls) -> bool:
+        """
+        Check whether the data type is a boolean type.
+
+        Returns
+        -------
+        bool
+            True if this data type class is `Boolean` or a subclass of it.
+        """
+        return issubclass(cls, Boolean)
+
 
 def _custom_reconstruct(
     cls: type[Any], base: type[Any], state: Any

@@ -383,6 +383,17 @@ def _from_pandas(
             pandas_to_pyseries(name, values, nan_to_null=nan_to_null)
         )
 
+    def _str_value(self, index: int) -> str:
+        """
+        Return a string representation of the element at `index`.
+
+        This is a thin wrapper that forwards to `str_value` on the underlying
+        `self._s` object.
+
+        Parameters
+        ----------
+        index
+            Position of the element to format.
+
+        Returns
+        -------
+        str
+            String representation of the value.
+
+        Notes
+        -----
+        NOTE(review): the native `str_value` binding appears to unwrap its
+        result, so an out-of-range `index` presumably panics instead of
+        raising a regular exception - confirm before using this with
+        indices that are not known to be valid.
+        """
+        return self._s.str_value(index)
+
     def _get_buffer_info(self) -> BufferInfo:
         """
         Return pointer, offset, and length information about the underlying buffer.
@@ -1888,11 +1899,21 @@ def describe(
         -----
         The median is included by default as the 50% percentile.
 
+        The mean for boolean series is the fraction of non-null values
+        that are true.
+
         Returns
         -------
         DataFrame
             Mapping with summary statistics of a Series.
 
+        Warnings
+        --------
+        We will never guarantee the output of describe to be stable.
+        It will show statistics that we deem informative and may
+        be updated in the future.
+
         Examples
         --------
         >>> s = pl.Series([1, 2, 3, 4, 5])
@@ -1914,6 +1935,20 @@ def describe(
         │ max        ┆ 5.0      │
         └────────────┴──────────┘
 
+        >>> s = pl.Series([True, False, True, None, True])
+        >>> s.describe()
+        shape: (4, 2)
+        ┌────────────┬───────┐
+        │ statistic  ┆ value │
+        │ ---        ┆ ---   │
+        │ str        ┆ f64   │
+        ╞════════════╪═══════╡
+        │ count      ┆ 4.0   │
+        │ null_count ┆ 1.0   │
+        │ sum        ┆ 3.0   │
+        │ mean       ┆ 0.75  │
+        └────────────┴───────┘
+
         Non-numeric data types may not have all statistics available.
 
         >>> s = pl.Series(["a", "a", None, "b", "c"])
@@ -1946,11 +1981,12 @@ def describe(
             stats["max"] = self.max()
 
         elif self.dtype == Boolean:
-            stats_dtype = Int64
+            stats_dtype = Float64
             stats = {
                 "count": self.count(),
                 "null_count": self.null_count(),
                 "sum": self.sum(),
+                "mean": self.mean(),
             }
         elif self.dtype == String:
             stats_dtype = Int64

@@ -145,6 +145,10 @@ impl PySeries {
         }
     }
 
+    // Return the string representation of the value at `index`.
+    //
+    // Delegates to `str_value` on the wrapped series and converts the result
+    // into an owned `String`.
+    //
+    // NOTE(review): the lookup result is unwrapped, so this panics whenever
+    // the inner `str_value` call does not yield a value (presumably an
+    // out-of-bounds `index` - confirm). Consider returning a fallible result
+    // so the failure surfaces as a regular Python exception instead.
+    fn str_value(&self, index: usize) -> String {
+        self.series.str_value(index).unwrap().into()
+    }
+
     fn rechunk(&mut self, in_place: bool) -> Option<Self> {
         let series = self.series.rechunk();
         if in_place {

@@ -48,7 +48,7 @@ def test_df_describe() -> None:
                 3.0,
             ],
             "b": [2.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 5.0, 5.0, 5.0],
-            "c": ["3", "0", None, None, "False", None, None, None, "True"],
+            "c": ["3", "0", "0.666667", None, "false", None, None, None, "true"],
             "d": ["2", "1", None, None, "b", None, None, None, "c"],
             "e": ["2", "1", None, None, None, None, None, None, None],
             "f": ["3", "0", None, None, "2020-01-01", None, None, None, "2022-01-01"],

@@ -61,11 +61,7 @@ def test_series_describe_boolean() -> None:
     s = pl.Series([True, False, None, True, True])
     result = s.describe()
 
-    stats = {
-        "count": 4,
-        "null_count": 1,
-        "sum": 3,
-    }
+    stats = {"count": 4, "null_count": 1, "sum": 3, "mean": 0.75}
     expected = pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
     assert_frame_equal(expected, result)