Skip to content

Commit

Permalink
feat(python): support mean for bool columns in DataFrame.describe
Browse files Browse the repository at this point in the history
  • Loading branch information
taki committed Jan 21, 2024
1 parent 988a15a commit 0a701db
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 20 deletions.
43 changes: 24 additions & 19 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4391,6 +4391,9 @@ def describe(
-----
The median is included by default as the 50% percentile.
The mean for boolean columns is the ratio of true values
to the total non-null values.
Warnings
--------
We will never guarantee the output of describe to be stable.
Expand All @@ -4416,29 +4419,30 @@ def describe(
... )
>>> df.describe()
shape: (9, 7)
┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐
│ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡
│ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │
│ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │
│ min ┆ 1.0 ┆ 4.0 ┆ False ┆ b ┆ eur ┆ 2020-01-01 │
│ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │
│ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │
│ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │
│ max ┆ 3.0 ┆ 5.0 ┆ True ┆ c ┆ usd ┆ 2022-01-01 │
└────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘
┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐
│ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
│ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │
│ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │
│ min ┆ 1.0 ┆ 4.0 ┆ false ┆ b ┆ eur ┆ 2020-01-01 │
│ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │
│ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │
│ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │
│ max ┆ 3.0 ┆ 5.0 ┆ true ┆ c ┆ usd ┆ 2022-01-01 │
└────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
"""
if not self.columns:
msg = "cannot describe a DataFrame without any columns"
raise TypeError(msg)

# Determine which columns should get std/mean/percentile statistics
stat_cols = {c for c, dt in self.schema.items() if dt.is_numeric()}

# Determine bool columns to include mean statistics
bool_cols = {c for c, dt in self.schema.items() if dt.is_bool()}
# Determine metrics and optional/additional percentiles
metrics = ["count", "null_count", "mean", "std", "min"]
percentile_exprs = []
Expand All @@ -4451,7 +4455,9 @@ def describe(
metrics.append("max")

mean_exprs = [
(F.col(c).mean() if c in stat_cols else F.lit(None)).alias(f"mean:{c}")
(
F.col(c).mean() if c in stat_cols or c in bool_cols else F.lit(None)
).alias(f"mean:{c}")
for c in self.columns
]
std_exprs = [
Expand Down Expand Up @@ -4490,14 +4496,13 @@ def describe(
df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
for n in range(len(metrics))
]

# Cast by column type (numeric -> float), (other, including bool -> string)
summary = dict(zip(self.columns, list(zip(*described))))
for c in self.columns:
summary[c] = [ # type: ignore[assignment]
None
if (v is None or isinstance(v, dict))
else (float(v) if c in stat_cols else str(v))
else (float(v) if c in stat_cols else pl.Series([v])._str_value(0))
for v in summary[c]
]

Expand Down
5 changes: 5 additions & 0 deletions py-polars/polars/datatypes/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,11 @@ def is_nested(cls) -> bool:
"""Check whether the data type is a nested type."""
return issubclass(cls, NestedType)

@classmethod
def is_bool(cls) -> bool:
    """Return True when this data type is `Boolean` (or a subclass of it)."""
    matches_boolean = issubclass(cls, Boolean)
    return matches_boolean


def _custom_reconstruct(
cls: type[Any], base: type[Any], state: Any
Expand Down
11 changes: 11 additions & 0 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,17 @@ def _from_pandas(
pandas_to_pyseries(name, values, nan_to_null=nan_to_null)
)

def _str_value(self, index: int) -> str:
    """
    Get the string representation of a single element.

    Parameters
    ----------
    index
        Position of the element to format.

    Returns
    -------
    str
        The result of the underlying series' `str_value` call for `index`.
    """
    # Delegate to the wrapped native series object.
    backing_series = self._s
    return backing_series.str_value(index)

def _get_buffer_info(self) -> BufferInfo:
"""
Return pointer, offset, and length information about the underlying buffer.
Expand Down
4 changes: 4 additions & 0 deletions py-polars/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ impl PySeries {
}
}

/// Return the string representation of the element at `index`.
///
/// Panics if the underlying `str_value` call returns an error, because the
/// result is unwrapped rather than propagated.
fn str_value(&self, index: usize) -> String {
    let rendered = self.series.str_value(index).unwrap();
    rendered.into()
}

fn rechunk(&mut self, in_place: bool) -> Option<Self> {
let series = self.series.rechunk();
if in_place {
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/dataframe/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_df_describe() -> None:
3.0,
],
"b": [2.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 5.0, 5.0, 5.0],
"c": ["3", "0", None, None, "False", None, None, None, "True"],
"c": ["3", "0", "0.666667", None, "false", None, None, None, "true"],
"d": ["2", "1", None, None, "b", None, None, None, "c"],
"e": ["2", "1", None, None, None, None, None, None, None],
"f": ["3", "0", None, None, "2020-01-01", None, None, None, "2022-01-01"],
Expand Down

0 comments on commit 0a701db

Please sign in to comment.