fix: Output correct dtype for mean_horizontal on a single column (#…

…15118)
pola-rs · Apr 13, 2024 · 34ae669 · 34ae669
1 parent ded6302
commit 34ae669
Show file tree

Hide file tree

Showing 3 changed files with 96 additions and 2 deletions.
diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs
@@ -2563,7 +2563,12 @@ impl DataFrame {
     pub fn mean_horizontal(&self, null_strategy: NullStrategy) -> PolarsResult<Option<Series>> {
         match self.columns.len() {
             0 => Ok(None),
-            1 => Ok(Some(self.columns[0].clone())),
+            1 => Ok(Some(match self.columns[0].dtype() {
+                dt if dt != &DataType::Float32 && (dt.is_numeric() || dt == &DataType::Boolean) => {
+                    self.columns[0].cast(&DataType::Float64)?
+                },
+                _ => self.columns[0].clone(),
+            })),
             _ => {
                 let columns = self
                     .columns

diff --git a/py-polars/tests/unit/operations/test_aggregations.py b/py-polars/tests/unit/operations/test_aggregations.py
@@ -12,6 +12,8 @@
 if TYPE_CHECKING:
     import numpy.typing as npt
 
+    from polars.type_aliases import PolarsDataType
+
 
 def test_quantile_expr_input() -> None:
     df = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [0.0, 0.0, 0.3, 0.2, 0.0]})
@@ -471,3 +473,60 @@ def test_grouping_hash_14749() -> None:
         .select(pl.col("x").max().over("grp"))["x"]
         .value_counts()
     ).to_dict(as_series=False) == {"x": [3], "count": [1004]}
+
+
+@pytest.mark.parametrize(
+    ("in_dtype", "out_dtype"),
+    [
+        (pl.Boolean, pl.Float64),
+        (pl.UInt8, pl.Float64),
+        (pl.UInt16, pl.Float64),
+        (pl.UInt32, pl.Float64),
+        (pl.UInt64, pl.Float64),
+        (pl.Int8, pl.Float64),
+        (pl.Int16, pl.Float64),
+        (pl.Int32, pl.Float64),
+        (pl.Int64, pl.Float64),
+        (pl.Float32, pl.Float32),
+        (pl.Float64, pl.Float64),
+    ],
+)
+def test_horizontal_mean_single_column(
+    in_dtype: PolarsDataType,
+    out_dtype: PolarsDataType,
+) -> None:
+    out = (
+        pl.LazyFrame({"a": pl.Series([1, 0], dtype=in_dtype)})
+        .select(pl.mean_horizontal(pl.all()))
+        .collect()
+    )
+
+    assert_frame_equal(out, pl.DataFrame({"a": pl.Series([1.0, 0.0], dtype=out_dtype)}))
+
+
+def test_horizontal_mean_in_groupby_15115() -> None:
+    nbr_records = 1000
+    out = (
+        pl.LazyFrame(
+            {
+                "w": [None, "one", "two", "three"] * nbr_records,
+                "x": [None, None, "two", "three"] * nbr_records,
+                "y": [None, None, None, "three"] * nbr_records,
+                "z": [None, None, None, None] * nbr_records,
+            }
+        )
+        .select(pl.mean_horizontal(pl.all().is_null()).alias("mean_null"))
+        .group_by("mean_null")
+        .len()
+        .sort(by="mean_null")
+        .collect()
+    )
+    assert_frame_equal(
+        out,
+        pl.DataFrame(
+            {
+                "mean_null": pl.Series([0.25, 0.5, 0.75, 1.0], dtype=pl.Float64),
+                "len": pl.Series([nbr_records] * 4, dtype=pl.UInt32),
+            }
+        ),
+    )
diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py
@@ -2,13 +2,16 @@
 
 from collections import OrderedDict
 from datetime import date, timedelta
-from typing import Any, Iterator, Mapping
+from typing import TYPE_CHECKING, Any, Iterator, Mapping
 
 import pytest
 
 import polars as pl
 from polars.testing import assert_frame_equal, assert_series_equal
 
+if TYPE_CHECKING:
+    from polars.type_aliases import PolarsDataType
+
 
 class CustomSchema(Mapping[str, Any]):
     """Dummy schema object for testing compatibility with Mapping."""
@@ -640,6 +643,33 @@ def test_schema_boolean_sum_horizontal() -> None:
     assert lf.schema == OrderedDict([("a", pl.UInt32)])
 
 
+@pytest.mark.parametrize(
+    ("in_dtype", "out_dtype"),
+    [
+        (pl.Boolean, pl.Float64),
+        (pl.UInt8, pl.Float64),
+        (pl.UInt16, pl.Float64),
+        (pl.UInt32, pl.Float64),
+        (pl.UInt64, pl.Float64),
+        (pl.Int8, pl.Float64),
+        (pl.Int16, pl.Float64),
+        (pl.Int32, pl.Float64),
+        (pl.Int64, pl.Float64),
+        (pl.Float32, pl.Float32),
+        (pl.Float64, pl.Float64),
+    ],
+)
+def test_schema_mean_horizontal_single_column(
+    in_dtype: PolarsDataType,
+    out_dtype: PolarsDataType,
+) -> None:
+    lf = pl.LazyFrame({"a": pl.Series([1, 0], dtype=in_dtype)}).select(
+        pl.mean_horizontal(pl.all())
+    )
+
+    assert lf.schema == OrderedDict([("a", out_dtype)])
+
+
 def test_struct_alias_prune_15401() -> None:
     df = pl.DataFrame({"a": []}, schema={"a": pl.Struct({"b": pl.Int8})})
     assert df.select(pl.col("a").alias("c").struct.field("b")).columns == ["b"]