Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support mean for bool columns in DataFrame.describe and Series.describe #13884

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 24 additions & 19 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4391,6 +4391,9 @@ def describe(
-----
The median is included by default as the 50% percentile.

The mean for boolean columns is the ratio of true values
to the total non-null values.

Warnings
--------
We will never guarantee the output of describe to be stable.
Expand All @@ -4416,29 +4419,30 @@ def describe(
... )
>>> df.describe()
shape: (9, 7)
┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐
│ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡
│ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │
│ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │
│ min ┆ 1.0 ┆ 4.0 ┆ False ┆ b ┆ eur ┆ 2020-01-01 │
│ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │
│ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │
│ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │
│ max ┆ 3.0 ┆ 5.0 ┆ True ┆ c ┆ usd ┆ 2022-01-01 │
└────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘
┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐
│ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
│ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │
│ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │
│ min ┆ 1.0 ┆ 4.0 ┆ false ┆ b ┆ eur ┆ 2020-01-01 │
│ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │
│ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │
│ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │
│ max ┆ 3.0 ┆ 5.0 ┆ true ┆ c ┆ usd ┆ 2022-01-01 │
└────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
"""
if not self.columns:
msg = "cannot describe a DataFrame without any columns"
raise TypeError(msg)

# Determine which columns should get std/mean/percentile statistics
stat_cols = {c for c, dt in self.schema.items() if dt.is_numeric()}

# Determine which boolean columns should also get the mean statistic
bool_cols = {c for c, dt in self.schema.items() if dt.is_bool()}
# Determine metrics and optional/additional percentiles
metrics = ["count", "null_count", "mean", "std", "min"]
percentile_exprs = []
Expand All @@ -4451,7 +4455,9 @@ def describe(
metrics.append("max")

mean_exprs = [
(F.col(c).mean() if c in stat_cols else F.lit(None)).alias(f"mean:{c}")
(
F.col(c).mean() if c in stat_cols or c in bool_cols else F.lit(None)
).alias(f"mean:{c}")
for c in self.columns
]
std_exprs = [
Expand Down Expand Up @@ -4490,14 +4496,13 @@ def describe(
df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
for n in range(len(metrics))
]

# Cast by column type (numeric -> float), (bool/other -> string)
summary = dict(zip(self.columns, list(zip(*described))))
for c in self.columns:
summary[c] = [ # type: ignore[assignment]
None
if (v is None or isinstance(v, dict))
else (float(v) if c in stat_cols else str(v))
else (float(v) if c in stat_cols else pl.Series([v])._str_value(0))
for v in summary[c]
]

Expand Down
5 changes: 5 additions & 0 deletions py-polars/polars/datatypes/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,11 @@ def is_nested(cls) -> bool:
"""Check whether the data type is a nested type."""
return issubclass(cls, NestedType)

@classmethod
def is_bool(cls) -> bool:
    """
    Check whether the data type is a boolean type.

    Returns
    -------
    bool
        True when this class is `Boolean` or a subclass of it.
    """
    return issubclass(cls, Boolean)


def _custom_reconstruct(
cls: type[Any], base: type[Any], state: Any
Expand Down
38 changes: 37 additions & 1 deletion py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,17 @@ def _from_pandas(
pandas_to_pyseries(name, values, nan_to_null=nan_to_null)
)

def _str_value(self, index: int) -> str:
    """
    Format the element at `index` as a string.

    Parameters
    ----------
    index
        Position of the element to format.

    Returns
    -------
    str
        Whatever the `str_value` call on the wrapped `self._s` object
        returns for that position.
    """
    inner = self._s
    return inner.str_value(index)

def _get_buffer_info(self) -> BufferInfo:
"""
Return pointer, offset, and length information about the underlying buffer.
Expand Down Expand Up @@ -1888,11 +1899,21 @@ def describe(
-----
The median is included by default as the 50% percentile.

The mean for boolean series is the ratio of true values
to the total non-null values.

Returns
-------
DataFrame
Mapping with summary statistics of a Series.

Warnings
--------
We will never guarantee the output of describe to be stable.
It will show statistics that we deem informative and may
be updated in the future.

Examples
--------
>>> s = pl.Series([1, 2, 3, 4, 5])
Expand All @@ -1914,6 +1935,20 @@ def describe(
│ max ┆ 5.0 │
└────────────┴──────────┘

>>> s = pl.Series([True, False, True, None, True])
>>> s.describe()
shape: (4, 2)
┌────────────┬───────┐
│ statistic ┆ value │
│ --- ┆ --- │
│ str ┆ f64 │
╞════════════╪═══════╡
│ count ┆ 4.0 │
│ null_count ┆ 1.0 │
│ sum ┆ 3.0 │
│ mean ┆ 0.75 │
└────────────┴───────┘

Non-numeric data types may not have all statistics available.

>>> s = pl.Series(["a", "a", None, "b", "c"])
Expand Down Expand Up @@ -1946,11 +1981,12 @@ def describe(
stats["max"] = self.max()

elif self.dtype == Boolean:
stats_dtype = Int64
stats_dtype = Float64
stats = {
"count": self.count(),
"null_count": self.null_count(),
"sum": self.sum(),
"mean": self.mean(),
}
elif self.dtype == String:
stats_dtype = Int64
Expand Down
4 changes: 4 additions & 0 deletions py-polars/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ impl PySeries {
}
}

/// Return the element at `index` rendered as an owned `String`.
///
/// NOTE(review): the `unwrap` panics if the underlying `str_value` call
/// fails — presumably for an out-of-bounds `index`; confirm and consider
/// surfacing this as a Python exception instead.
fn str_value(&self, index: usize) -> String {
    let rendered = self.series.str_value(index).unwrap();
    rendered.into()
}

fn rechunk(&mut self, in_place: bool) -> Option<Self> {
let series = self.series.rechunk();
if in_place {
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/dataframe/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_df_describe() -> None:
3.0,
],
"b": [2.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 5.0, 5.0, 5.0],
"c": ["3", "0", None, None, "False", None, None, None, "True"],
"c": ["3", "0", "0.666667", None, "false", None, None, None, "true"],
"d": ["2", "1", None, None, "b", None, None, None, "c"],
"e": ["2", "1", None, None, None, None, None, None, None],
"f": ["3", "0", None, None, "2020-01-01", None, None, None, "2022-01-01"],
Expand Down
6 changes: 1 addition & 5 deletions py-polars/tests/unit/series/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,7 @@ def test_series_describe_boolean() -> None:
s = pl.Series([True, False, None, True, True])
result = s.describe()

stats = {
"count": 4,
"null_count": 1,
"sum": 3,
}
stats = {"count": 4, "null_count": 1, "sum": 3, "mean": 0.75}
expected = pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
assert_frame_equal(expected, result)

Expand Down
Loading