From 928854d8b1821dc15e8a32b545d95742d0a7139a Mon Sep 17 00:00:00 2001 From: Marshall Crumiller Date: Sun, 11 Feb 2024 22:48:33 -0500 Subject: [PATCH 1/8] Infer `values` columns when absent --- crates/polars-lazy/src/frame/pivot.rs | 28 ++-- crates/polars-ops/src/frame/pivot/mod.rs | 73 +++++---- crates/polars/tests/it/core/pivot.rs | 155 ++++++++++-------- .../rust/user-guide/transformations/pivot.rs | 15 +- py-polars/polars/dataframe/frame.py | 23 +-- py-polars/src/dataframe.rs | 5 +- py-polars/tests/unit/operations/test_pivot.py | 85 ++++++---- 7 files changed, 225 insertions(+), 159 deletions(-) diff --git a/crates/polars-lazy/src/frame/pivot.rs b/crates/polars-lazy/src/frame/pivot.rs index c9e0339593db..e7254ea0d908 100644 --- a/crates/polars-lazy/src/frame/pivot.rs +++ b/crates/polars-lazy/src/frame/pivot.rs @@ -31,11 +31,11 @@ impl PhysicalAggExpr for PivotExpr { } } -pub fn pivot( +pub fn pivot( df: &DataFrame, - values: I0, - index: I1, - columns: I2, + index: I0, + columns: I1, + values: Option, sort_columns: bool, agg_expr: Option, // used as separator/delimiter in generated column names. @@ -43,10 +43,10 @@ pub fn pivot( ) -> PolarsResult where I0: IntoIterator, - S0: AsRef, I1: IntoIterator, - S1: AsRef, I2: IntoIterator, + S0: AsRef, + S1: AsRef, S2: AsRef, { // make sure that the root column is replaced @@ -56,20 +56,20 @@ where }); polars_ops::pivot::pivot( df, - values, index, columns, + values, sort_columns, agg_expr, separator, ) } -pub fn pivot_stable( +pub fn pivot_stable( df: &DataFrame, - values: I0, - index: I1, - columns: I2, + index: I0, + columns: I1, + values: Option, sort_columns: bool, agg_expr: Option, // used as separator/delimiter in generated column names. @@ -77,10 +77,10 @@ pub fn pivot_stable( ) -> PolarsResult where I0: IntoIterator, - S0: AsRef, I1: IntoIterator, - S1: AsRef, I2: IntoIterator, + S0: AsRef, + S1: AsRef, S2: AsRef, { // make sure that the root column is replaced @@ -90,9 +90,9 @@ where }); polars_ops::pivot::pivot_stable( df, - values, index, columns, + values, sort_columns, agg_expr, separator, diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index 0b5bb067bf67..a9c8baa3bef6 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -82,27 +82,23 @@ fn restore_logical_type(s: &Series, logical_type: &DataType) -> Series { /// # Note /// Polars'/arrow memory is not ideal for transposing operations like pivots. /// If you have a relatively large table, consider using a group_by over a pivot. -pub fn pivot( +pub fn pivot( pivot_df: &DataFrame, - values: I0, - index: I1, - columns: I2, + index: I0, + columns: I1, + values: Option, sort_columns: bool, agg_fn: Option, separator: Option<&str>, ) -> PolarsResult where I0: IntoIterator, - S0: AsRef, I1: IntoIterator, - S1: AsRef, I2: IntoIterator, + S0: AsRef, + S1: AsRef, S2: AsRef, { - let values = values - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect::>(); let index = index .into_iter() .map(|s| s.as_ref().to_string()) @@ -111,11 +107,24 @@ where .into_iter() .map(|s| s.as_ref().to_string()) .collect::>(); + let values = match values { + Some(v) => v + .into_iter() + .map(|s| s.as_ref().to_string()) + .collect::>(), + None => pivot_df + // No value columns provided, use remaining columns + .get_column_names() + .into_iter() + .map(|s| s.to_string()) + .filter(|s| !(index.contains(s) | columns.contains(s))) + .collect(), + }; pivot_impl( pivot_df, - &values, &index, &columns, + &values, agg_fn, sort_columns, false, @@ -128,27 +137,23 @@ where /// # Note /// Polars'/arrow memory is not ideal for transposing operations like pivots. /// If you have a relatively large table, consider using a group_by over a pivot. -pub fn pivot_stable( +pub fn pivot_stable( pivot_df: &DataFrame, - values: I0, - index: I1, - columns: I2, + index: I0, + columns: I1, + values: Option, sort_columns: bool, agg_fn: Option, separator: Option<&str>, ) -> PolarsResult where I0: IntoIterator, - S0: AsRef, I1: IntoIterator, - S1: AsRef, I2: IntoIterator, + S0: AsRef, + S1: AsRef, S2: AsRef, { - let values = values - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect::>(); let index = index .into_iter() .map(|s| s.as_ref().to_string()) @@ -157,12 +162,24 @@ where .into_iter() .map(|s| s.as_ref().to_string()) .collect::>(); - + let values = match values { + Some(v) => v + .into_iter() + .map(|s| s.as_ref().to_string()) + .collect::>(), + None => pivot_df + // No value columns provided, use remaining columns + .get_column_names() + .into_iter() + .map(|s| s.to_string()) + .filter(|s| !(index.contains(s) | columns.contains(s))) + .collect(), + }; pivot_impl( pivot_df, - &values, &index, &columns, + &values, agg_fn, sort_columns, true, @@ -173,13 +190,13 @@ where #[allow(clippy::too_many_arguments)] fn pivot_impl( pivot_df: &DataFrame, - // these columns will be aggregated in the nested group_by - values: &[String], // keys of the first group_by operation index: &[String], // these columns will be used for a nested group_by // the rows of this nested group_by will be pivoted as header column values columns: &[String], + // these columns will be aggregated in the nested group_by + values: &[String], // aggregation function agg_fn: Option, sort_columns: bool, @@ -206,9 +223,9 @@ fn pivot_impl( let pivot_df = unsafe { binding.with_column_unchecked(columns_struct) }; pivot_impl_single_column( pivot_df, + index, &column, values, - index, agg_fn, sort_columns, separator, @@ -216,9 +233,9 @@ fn pivot_impl( } else { pivot_impl_single_column( pivot_df, + index, unsafe { columns.get_unchecked(0) }, values, - index, agg_fn, sort_columns, separator, @@ -228,9 +245,9 @@ fn pivot_impl( fn pivot_impl_single_column( pivot_df: &DataFrame, + index: &[String], column: &str, values: &[String], - index: &[String], agg_fn: Option, sort_columns: bool, separator: Option<&str>, diff --git a/crates/polars/tests/it/core/pivot.rs b/crates/polars/tests/it/core/pivot.rs index ce1bee178557..a7c29e87bcde 100644 --- a/crates/polars/tests/it/core/pivot.rs +++ b/crates/polars/tests/it/core/pivot.rs @@ -6,29 +6,45 @@ use polars_ops::pivot::{pivot, pivot_stable, PivotAgg}; #[cfg(feature = "dtype-date")] fn test_pivot_date_() -> PolarsResult<()> { let mut df = df![ - "A" => [1, 1, 1, 1, 1, 1, 1, 1], - "B" => [8, 2, 3, 6, 3, 6, 2, 2], - "C" => [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000] + "index" => [8, 2, 3, 6, 3, 6, 2, 2], + "columns" => [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], + "values" => [1, 1, 1, 1, 1, 1, 1, 1], ]?; - df.try_apply("C", |s| s.cast(&DataType::Date))?; + df.try_apply("columns", |s| s.cast(&DataType::Date))?; - let out = pivot(&df, ["A"], ["B"], ["C"], true, Some(PivotAgg::Count), None)?; + let out = pivot( + &df, + ["index"], + ["columns"], + Some(["values"]), + true, + Some(PivotAgg::Count), + None, + )?; let first = 1 as IdxSize; let expected = df![ - "B" => [8i32, 2, 3, 6], + "index" => [8i32, 2, 3, 6], "1972-09-27" => [first, 3, 2, 2] ]?; assert!(out.equals_missing(&expected)); - let mut out = pivot_stable(&df, ["C"], ["B"], ["A"], true, Some(PivotAgg::First), None)?; + let mut out = pivot_stable( + &df, + ["index"], + ["values"], + Some(["columns"]), // swapped on purpose + true, + Some(PivotAgg::First), + None, + )?; out.try_apply("1", |s| { let ca = s.date()?; Ok(ca.to_string("%Y-%d-%m")) })?; let expected = df![ - "B" => [8i32, 2, 3, 6], + "index" => [8i32, 2, 3, 6], "1" => ["1972-27-09", "1972-27-09", "1972-27-09", "1972-27-09"] ]?; assert!(out.equals_missing(&expected)); @@ -38,31 +54,31 @@ fn test_pivot_date_() -> PolarsResult<()> { #[test] fn test_pivot_old() { - let s0 = Series::new("foo", ["A", "A", "B", "B", "C"].as_ref()); - let s1 = Series::new("N", [1, 2, 2, 4, 2].as_ref()); - let s2 = Series::new("bar", ["k", "l", "m", "m", "l"].as_ref()); + let s0 = Series::new("index", ["A", "A", "B", "B", "C"].as_ref()); + let s2 = Series::new("columns", ["k", "l", "m", "m", "l"].as_ref()); + let s1 = Series::new("values", [1, 2, 2, 4, 2].as_ref()); let df = DataFrame::new(vec![s0, s1, s2]).unwrap(); let pvt = pivot( &df, - ["N"], - ["foo"], - ["bar"], + ["index"], + ["columns"], + Some(["values"]), false, Some(PivotAgg::Sum), None, ) .unwrap(); - assert_eq!(pvt.get_column_names(), &["foo", "k", "l", "m"]); + assert_eq!(pvt.get_column_names(), &["index", "k", "l", "m"]); assert_eq!( Vec::from(&pvt.column("m").unwrap().i32().unwrap().sort(false)), &[None, None, Some(6)] ); let pvt = pivot( &df, - ["N"], - ["foo"], - ["bar"], + ["index"], + ["columns"], + Some(["values"]), false, Some(PivotAgg::Min), None, @@ -74,9 +90,9 @@ fn test_pivot_old() { ); let pvt = pivot( &df, - ["N"], - ["foo"], - ["bar"], + ["index"], + ["columns"], + Some(["values"]), false, Some(PivotAgg::Max), None, @@ -88,9 +104,9 @@ fn test_pivot_old() { ); let pvt = pivot( &df, - ["N"], - ["foo"], - ["bar"], + ["index"], + ["columns"], + Some(["values"]), false, Some(PivotAgg::Mean), None, @@ -102,9 +118,9 @@ fn test_pivot_old() { ); let pvt = pivot( &df, - ["N"], - ["foo"], - ["bar"], + ["index"], + ["columns"], + Some(["values"]), false, Some(PivotAgg::Count), None, @@ -120,46 +136,51 @@ fn test_pivot_old() { #[cfg(feature = "dtype-categorical")] fn test_pivot_categorical() -> PolarsResult<()> { let mut df = df![ - "A" => [1, 1, 1, 1, 1, 1, 1, 1], - "B" => [8, 2, 3, 6, 3, 6, 2, 2], - "C" => ["a", "b", "c", "a", "b", "c", "a", "b"] + "index" => [1, 1, 1, 1, 1, 1, 1, 1], + "columns" => ["a", "b", "c", "a", "b", "c", "a", "b"], + "values" => [8, 2, 3, 6, 3, 6, 2, 2], ]?; - df.try_apply("C", |s| { + df.try_apply("columns", |s| { s.cast(&DataType::Categorical(None, Default::default())) })?; - let out = pivot(&df, ["A"], ["B"], ["C"], true, Some(PivotAgg::Count), None)?; - assert_eq!(out.get_column_names(), &["B", "a", "b", "c"]); + let out = pivot( + &df, + ["index"], + ["columns"], + Some(["values"]), + true, + Some(PivotAgg::Count), + None, + )?; + assert_eq!(out.get_column_names(), &["index", "a", "b", "c"]); Ok(()) } #[test] fn test_pivot_new() -> PolarsResult<()> { - let df = df!["A"=> ["foo", "foo", "foo", "foo", "foo", - "bar", "bar", "bar", "bar"], - "B"=> ["one", "one", "one", "two", "two", - "one", "one", "two", "two"], - "C"=> ["small", "large", "large", "small", - "small", "large", "small", "small", "large"], - "breaky"=> ["jam", "egg", "egg", "egg", - "jam", "jam", "potato", "jam", "jam"], - "D"=> [1, 2, 2, 3, 3, 4, 5, 6, 7], - "E"=> [2, 4, 5, 5, 6, 6, 8, 9, 9] + let df = df![ + "index1"=> ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "index2"=> ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "cols1"=> ["small", "large", "large", "small", "small", "large", "small", "small", "large"], + "cols2"=> ["jam", "egg", "egg", "egg", "jam", "jam", "potato", "jam", "jam"], + "values1"=> [1, 2, 2, 3, 3, 4, 5, 6, 7], + "values2"=> [2, 4, 5, 5, 6, 6, 8, 9, 9] ]?; let out = (pivot_stable( &df, - ["D"], - ["A", "B"], - ["C"], + ["index1", "index2"], + ["cols1"], + Some(["values1"]), true, Some(PivotAgg::Sum), None, ))?; let expected = df![ - "A" => ["foo", "foo", "bar", "bar"], - "B" => ["one", "two", "one", "two"], + "index1" => ["foo", "foo", "bar", "bar"], + "index2" => ["one", "two", "one", "two"], "large" => [Some(4), None, Some(4), Some(7)], "small" => [1, 6, 5, 6], ]?; @@ -167,16 +188,16 @@ fn test_pivot_new() -> PolarsResult<()> { let out = pivot_stable( &df, - ["D"], - ["A", "B"], - ["C", "breaky"], + ["index1", "index2"], + ["cols1", "cols2"], + Some(["values1"]), true, Some(PivotAgg::Sum), None, )?; let expected = df![ - "A" => ["foo", "foo", "bar", "bar"], - "B" => ["one", "two", "one", "two"], + "index1" => ["foo", "foo", "bar", "bar"], + "index2" => ["one", "two", "one", "two"], "{\"large\",\"egg\"}" => [Some(4), None, None, None], "{\"large\",\"jam\"}" => [None, None, Some(4), Some(7)], "{\"small\",\"egg\"}" => [None, Some(3), None, None], @@ -191,22 +212,22 @@ fn test_pivot_new() -> PolarsResult<()> { #[test] fn test_pivot_2() -> PolarsResult<()> { let df = df![ - "name"=> ["avg", "avg", "act", "test", "test"], - "err" => [Some("name1"), Some("name2"), None, Some("name1"), Some("name2")], - "wght"=> [0.0, 0.1, 1.0, 0.4, 0.2] + "index" => [Some("name1"), Some("name2"), None, Some("name1"), Some("name2")], + "columns"=> ["avg", "avg", "act", "test", "test"], + "values"=> [0.0, 0.1, 1.0, 0.4, 0.2] ]?; let out = pivot_stable( &df, - ["wght"], - ["err"], - ["name"], + ["index"], + ["columns"], + Some(["values"]), false, Some(PivotAgg::First), None, )?; let expected = df![ - "err" => [Some("name1"), Some("name2"), None], + "index" => [Some("name1"), Some("name2"), None], "avg" => [Some(0.0), Some(0.1), None], "act" => [None, None, Some(1.)], "test" => [Some(0.4), Some(0.2), None], @@ -224,22 +245,22 @@ fn test_pivot_datetime() -> PolarsResult<()> { .and_hms_opt(12, 15, 0) .unwrap(); let df = df![ - "dt" => [dt, dt, dt, dt], - "key" => ["x", "x", "y", "y"], - "val" => [100, 50, 500, -80] + "index" => [dt, dt, dt, dt], + "columns" => ["x", "x", "y", "y"], + "values" => [100, 50, 500, -80] ]?; let out = pivot( &df, - ["val"], - ["dt"], - ["key"], + ["index"], + ["columns"], + Some(["values"]), false, Some(PivotAgg::Sum), None, )?; let expected = df![ - "dt" => [dt], + "index" => [dt], "x" => [150], "y" => [420] ]?; diff --git a/docs/src/rust/user-guide/transformations/pivot.rs b/docs/src/rust/user-guide/transformations/pivot.rs index 2115b528579c..804ead13f056 100644 --- a/docs/src/rust/user-guide/transformations/pivot.rs +++ b/docs/src/rust/user-guide/transformations/pivot.rs @@ -7,20 +7,29 @@ fn main() -> Result<(), Box> { // --8<-- [start:df] let df = df!( "foo"=> ["A", "A", "B", "B", "C"], - "N"=> [1, 2, 2, 4, 2], "bar"=> ["k", "l", "m", "n", "o"], + "N"=> [1, 2, 2, 4, 2], )?; println!("{}", &df); // --8<-- [end:df] // --8<-- [start:eager] - let out = pivot(&df, ["N"], ["foo"], ["bar"], false, None, None)?; + let out = pivot(&df, ["foo"], ["bar"], Some(["N"]), false, None, None)?; println!("{}", &out); // --8<-- [end:eager] // --8<-- [start:lazy] let q = df.lazy(); - let q2 = pivot(&q.collect()?, ["N"], ["foo"], ["bar"], false, None, None)?.lazy(); + let q2 = pivot( + &q.collect()?, + ["foo"], + ["bar"], + Some(["N"]), + false, + None, + None, + )? + .lazy(); let out = q2.collect()?; println!("{}", &out); // --8<-- [end:lazy] diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 8fc30f2ffd1d..98786befec8d 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -7246,20 +7246,11 @@ def explode( """ return self.lazy().explode(columns, *more_columns).collect(_eager=True) - @deprecate_nonkeyword_arguments( - allowed_args=["self"], - message=( - "The order of the parameters of `pivot` will change in the next breaking release." - " The order will become `index, columns, values` with `values` as an optional parameter." - " Use keyword arguments to silence this warning." - ), - version="0.20.8", - ) def pivot( self, - values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, + values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None, aggregate_function: PivotAgg | Expr | None = None, *, maintain_order: bool = True, @@ -7274,14 +7265,15 @@ def pivot( Parameters ---------- - values - Column values to aggregate. Can be multiple columns if the *columns* - arguments contains multiple columns as well. index One or multiple keys to group by. columns Name of the column(s) whose values will be used as the header of the output DataFrame. + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. If None, all remaining columns + will be used. aggregate_function Choose from: @@ -7392,9 +7384,10 @@ def pivot( │ b ┆ 0.964028 ┆ 0.999954 │ └──────┴──────────┴──────────┘ """ # noqa: W505 - values = _expand_selectors(self, values) index = _expand_selectors(self, index) columns = _expand_selectors(self, columns) + if values is not None: + values = _expand_selectors(self, values) if isinstance(aggregate_function, str): if aggregate_function == "first": @@ -7430,9 +7423,9 @@ def pivot( return self._from_pydf( self._df.pivot_expr( - values, index, columns, + values, maintain_order, sort_columns, aggregate_expr, diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 3fb1b284b04c..b357954eb98f 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -1218,11 +1218,12 @@ impl PyDataFrame { } #[cfg(feature = "pivot")] + #[pyo3(signature = (index, columns, values, maintain_order, sort_columns, aggregate_expr, separator))] pub fn pivot_expr( &self, - values: Vec, index: Vec, columns: Vec, + values: Option>, maintain_order: bool, sort_columns: bool, aggregate_expr: Option, @@ -1232,9 +1233,9 @@ impl PyDataFrame { let agg_expr = aggregate_expr.map(|expr| expr.inner); let df = fun( &self.df, - values, index, columns, + values, sort_columns, agg_expr, separator, diff --git a/py-polars/tests/unit/operations/test_pivot.py b/py-polars/tests/unit/operations/test_pivot.py index 5d0b4a6e69f1..be15f176b6d4 100644 --- a/py-polars/tests/unit/operations/test_pivot.py +++ b/py-polars/tests/unit/operations/test_pivot.py @@ -18,8 +18,8 @@ def test_pivot() -> None: df = pl.DataFrame( { "foo": ["A", "A", "B", "B", "C"], - "N": [1, 2, 2, 4, 2], "bar": ["k", "l", "m", "n", "o"], + "N": [1, 2, 2, 4, 2], } ) result = df.pivot(index="foo", columns="bar", values="N", aggregate_function=None) @@ -35,6 +35,34 @@ def test_pivot() -> None: assert_frame_equal(result, expected) +def test_pivot_no_values() -> None: + df = pl.DataFrame( + { + "foo": ["A", "A", "B", "B", "C"], + "bar": ["k", "l", "m", "n", "o"], + "N1": [1, 2, 2, 4, 2], + "N2": [1, 2, 2, 4, 2], + } + ) + result = df.pivot(index="foo", columns="bar", aggregate_function=None) + expected = pl.DataFrame( + { + "foo": ["A", "B", "C"], + "N1_bar_k": [1, None, None], + "N1_bar_l": [2, None, None], + "N1_bar_m": [None, 2, None], + "N1_bar_n": [None, 4, None], + "N1_bar_o": [None, None, 2], + "N2_bar_k": [1, None, None], + "N2_bar_l": [2, None, None], + "N2_bar_m": [None, 2, None], + "N2_bar_n": [None, 4, None], + "N2_bar_o": [None, None, 2], + } + ) + assert_frame_equal(result, expected) + + def test_pivot_list() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": [[1, 1], [2, 2], [3, 3]]}) @@ -47,9 +75,9 @@ def test_pivot_list() -> None: } ) out = df.pivot( + values="b", index="a", columns="a", - values="b", aggregate_function="first", sort_columns=True, ) @@ -77,7 +105,7 @@ def test_pivot_aggregate(agg_fn: PivotAgg, expected_rows: list[tuple[Any]]) -> N } ) result = df.pivot( - values="c", index="b", columns="a", aggregate_function=agg_fn, sort_columns=True + index="b", columns="a", values="c", aggregate_function=agg_fn, sort_columns=True ) assert result.rows() == expected_rows @@ -110,12 +138,12 @@ def test_pivot_categorical_index() -> None: schema=[("A", pl.Categorical), ("B", pl.Categorical)], ) - result = df.pivot(values="B", index=["A"], columns="B", aggregate_function="len") + result = df.pivot(index=["A"], columns="B", values="B", aggregate_function="len") expected = {"A": ["Fire", "Water"], "Car": [1, 2], "Ship": [1, None]} assert result.to_dict(as_series=False) == expected # test expression dispatch - result = df.pivot(values="B", index=["A"], columns="B", aggregate_function=pl.len()) + result = df.pivot(index=["A"], columns="B", values="B", aggregate_function=pl.len()) assert result.to_dict(as_series=False) == expected df = pl.DataFrame( @@ -127,7 +155,7 @@ def test_pivot_categorical_index() -> None: schema=[("A", pl.Categorical), ("B", pl.Categorical), ("C", pl.Categorical)], ) result = df.pivot( - values="B", index=["A", "C"], columns="B", aggregate_function="len" + index=["A", "C"], columns="B", values="B", aggregate_function="len" ) expected = { "A": ["Fire", "Water"], @@ -150,17 +178,17 @@ def test_pivot_multiple_values_column_names_5116() -> None: with pytest.raises(ComputeError, match="found multiple elements in the same group"): df.pivot( - values=["x1", "x2"], index="c1", columns="c2", + values=["x1", "x2"], separator="|", aggregate_function=None, ) result = df.pivot( - values=["x1", "x2"], index="c1", columns="c2", + values=["x1", "x2"], separator="|", aggregate_function="first", ) @@ -185,9 +213,9 @@ def test_pivot_duplicate_names_7731() -> None: } ) result = df.pivot( - values=cs.integer(), index=cs.float(), columns=cs.string(), + values=cs.integer(), aggregate_function="first", ).to_dict(as_series=False) expected = { @@ -202,7 +230,7 @@ def test_pivot_duplicate_names_7731() -> None: def test_pivot_duplicate_names_11663() -> None: df = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": ["x", "x"], "d": ["x", "y"]}) - result = df.pivot(values="a", index="b", columns=["c", "d"]).to_dict( + result = df.pivot(index="b", columns=["c", "d"], values="a").to_dict( as_series=False ) expected = {"b": [1, 2], '{"x","x"}': [1, None], '{"x","y"}': [None, 2]} @@ -220,7 +248,7 @@ def test_pivot_multiple_columns_12407() -> None: } ) result = df.pivot( - values=["a"], index="b", columns=["c", "e"], aggregate_function="len" + index="b", columns=["c", "e"], values=["a"], aggregate_function="len" ).to_dict(as_series=False) expected = {"b": ["a", "b"], '{"s","x"}': [1, None], '{"f","y"}': [None, 1]} assert result == expected @@ -254,7 +282,7 @@ def test_pivot_index_struct_14101() -> None: "d": [1, 1, 3], } ) - result = df.pivot(index="b", values="a", columns="c") + result = df.pivot(index="b", columns="c", values="a") expected = pl.DataFrame({"b": [{"a": 1}, {"a": 2}], "x": [1, None], "y": [2, 1]}) assert_frame_equal(result, expected) @@ -289,11 +317,11 @@ def test_pivot_floats() -> None: with pytest.raises(ComputeError, match="found multiple elements in the same group"): result = df.pivot( - values="price", index="weight", columns="quantity", aggregate_function=None + index="weight", columns="quantity", values="price", aggregate_function=None ) result = df.pivot( - values="price", index="weight", columns="quantity", aggregate_function="first" + index="weight", columns="quantity", values="price", aggregate_function="first" ) expected = { "weight": [1.0, 4.4, 8.8], @@ -304,9 +332,9 @@ def test_pivot_floats() -> None: assert result.to_dict(as_series=False) == expected result = df.pivot( - values="price", index=["article", "weight"], columns="quantity", + values="price", aggregate_function=None, ) expected = { @@ -329,12 +357,21 @@ def test_pivot_reinterpret_5907() -> None: ) result = df.pivot( - index=["A"], values=["C"], columns=["B"], aggregate_function=pl.element().sum() + index=["A"], columns=["B"], values=["C"], aggregate_function=pl.element().sum() ) expected = {"A": [3, -2], "x": [100, 50], "y": [500, -80]} assert result.to_dict(as_series=False) == expected +def test_pivot_subclassed_df() -> None: + class SubClassedDataFrame(pl.DataFrame): + pass + + df = SubClassedDataFrame({"a": [1, 2], "b": [3, 4]}) + result = df.pivot(index="a", columns="a", values="b", aggregate_function="first") + assert isinstance(result, SubClassedDataFrame) + + def test_pivot_temporal_logical_types() -> None: date_lst = [datetime(_, 1, 1) for _ in range(1960, 1980)] @@ -389,19 +426,7 @@ def test_aggregate_function_default() -> None: with pytest.raises( pl.ComputeError, match="found multiple elements in the same group" ): - df.pivot(values="a", index="b", columns="c") - - -def test_pivot_positional_args_deprecated() -> None: - df = pl.DataFrame( - { - "foo": ["A", "A", "B", "B", "C"], - "N": [1, 2, 2, 4, 2], - "bar": ["k", "l", "m", "n", "o"], - } - ) - with pytest.deprecated_call(): - df.pivot("N", "foo", "bar", aggregate_function=None) + df.pivot(index="b", columns="c", values="a") def test_pivot_aggregate_function_count_deprecated() -> None: @@ -467,7 +492,7 @@ def test_multi_index_containing_struct() -> None: "d": [1, 1, 3], } ) - result = df.pivot(index=("b", "d"), values="a", columns="c") + result = df.pivot(index=("b", "d"), columns="c", values="a") expected = pl.DataFrame( {"b": [{"a": 1}, {"a": 2}], "d": [1, 3], "x": [1, None], "y": [2, 1]} ) From ab6c80e63009897c24f41531983af67b626c155d Mon Sep 17 00:00:00 2001 From: Marshall Crumiller Date: Wed, 21 Feb 2024 08:46:20 -0500 Subject: [PATCH 2/8] Remove unwanted test --- py-polars/tests/unit/operations/test_pivot.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/py-polars/tests/unit/operations/test_pivot.py b/py-polars/tests/unit/operations/test_pivot.py index be15f176b6d4..f426d7204344 100644 --- a/py-polars/tests/unit/operations/test_pivot.py +++ b/py-polars/tests/unit/operations/test_pivot.py @@ -363,15 +363,6 @@ def test_pivot_reinterpret_5907() -> None: assert result.to_dict(as_series=False) == expected -def test_pivot_subclassed_df() -> None: - class SubClassedDataFrame(pl.DataFrame): - pass - - df = SubClassedDataFrame({"a": [1, 2], "b": [3, 4]}) - result = df.pivot(index="a", columns="a", values="b", aggregate_function="first") - assert isinstance(result, SubClassedDataFrame) - - def test_pivot_temporal_logical_types() -> None: date_lst = [datetime(_, 1, 1) for _ in range(1960, 1980)] From 1c1f1bb3bc63b7af4a940489b9a3196ea3ddb753 Mon Sep 17 00:00:00 2001 From: Marshall Crumiller Date: Wed, 21 Feb 2024 08:57:20 -0500 Subject: [PATCH 3/8] Rename variables to clarify test --- crates/polars/tests/it/core/pivot.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/crates/polars/tests/it/core/pivot.rs b/crates/polars/tests/it/core/pivot.rs index a7c29e87bcde..6f9c996b44cc 100644 --- a/crates/polars/tests/it/core/pivot.rs +++ b/crates/polars/tests/it/core/pivot.rs @@ -7,16 +7,17 @@ use polars_ops::pivot::{pivot, pivot_stable, PivotAgg}; fn test_pivot_date_() -> PolarsResult<()> { let mut df = df![ "index" => [8, 2, 3, 6, 3, 6, 2, 2], - "columns" => [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], - "values" => [1, 1, 1, 1, 1, 1, 1, 1], + "values1" => [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], + "values2" => [1, 1, 1, 1, 1, 1, 1, 1], ]?; - df.try_apply("columns", |s| s.cast(&DataType::Date))?; + df.try_apply("values1", |s| s.cast(&DataType::Date))?; + // Test with date as the `columns` input let out = pivot( &df, ["index"], - ["columns"], - Some(["values"]), + ["values1"], + Some(["values2"]), true, Some(PivotAgg::Count), None, @@ -29,11 +30,12 @@ fn test_pivot_date_() -> PolarsResult<()> { ]?; assert!(out.equals_missing(&expected)); + // Test with date as the `values` input. let mut out = pivot_stable( &df, ["index"], - ["values"], - Some(["columns"]), // swapped on purpose + ["values2"], + Some(["values1"]), true, Some(PivotAgg::First), None, From 910ba21b2a4592bfc54758b4e05fa5ca76294037 Mon Sep 17 00:00:00 2001 From: Marshall Crumiller Date: Wed, 21 Feb 2024 12:23:46 -0500 Subject: [PATCH 4/8] Use set logic to compute columns --- crates/polars-ops/src/frame/pivot/mod.rs | 67 +++++++++++++++--------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index a9c8baa3bef6..d04afa0dee1a 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -77,6 +77,45 @@ fn restore_logical_type(s: &Series, logical_type: &DataType) -> Series { } } +/// Determine `values` columns. +/// +/// When the optional `values` parameter is `None`, we use all remaining columns in the `DataFrame` +/// after `index` and `columns` have been excluded. When `values` is `Some`, we return a vector of +/// strings. +fn _get_values_columns( + df: &DataFrame, + index: &[String], + columns: &[String], + values: Option, +) -> Vec +where + I: IntoIterator, + S: AsRef, +{ + match values { + Some(v) => v + .into_iter() + .map(|s| s.as_ref().to_string()) + .collect::>(), + None => { + let column_names = df.get_column_names_owned(); + let mut column_set = PlHashSet::::with_capacity(column_names.len()); + + // Column names are always unique. + column_names.into_iter().for_each(|s| { + column_set.insert_unique_unchecked(s.to_string()); + }); + + // Remove `index` and `column` columns. + index.iter().chain(columns.iter()).for_each(|s| { + column_set.remove(s); + }); + + column_set.drain().collect() + }, + } +} + /// Do a pivot operation based on the group key, a pivot column and an aggregation function on the values column. /// /// # Note @@ -107,19 +146,7 @@ where .into_iter() .map(|s| s.as_ref().to_string()) .collect::>(); - let values = match values { - Some(v) => v - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect::>(), - None => pivot_df - // No value columns provided, use remaining columns - .get_column_names() - .into_iter() - .map(|s| s.to_string()) - .filter(|s| !(index.contains(s) | columns.contains(s))) - .collect(), - }; + let values = _get_values_columns(pivot_df, &index, &columns, values); pivot_impl( pivot_df, &index, @@ -162,19 +189,7 @@ where .into_iter() .map(|s| s.as_ref().to_string()) .collect::>(); - let values = match values { - Some(v) => v - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect::>(), - None => pivot_df - // No value columns provided, use remaining columns - .get_column_names() - .into_iter() - .map(|s| s.to_string()) - .filter(|s| !(index.contains(s) | columns.contains(s))) - .collect(), - }; + let values = _get_values_columns(pivot_df, &index, &columns, values); pivot_impl( pivot_df, &index, From 024e21d79d95313a09c948acebd5d45fb8042c32 Mon Sep 17 00:00:00 2001 From: Marshall Crumiller Date: Wed, 21 Feb 2024 13:45:58 -0500 Subject: [PATCH 5/8] Revert python arg order and fix test --- py-polars/polars/dataframe/frame.py | 19 ++++++++++++++----- py-polars/tests/unit/operations/test_pivot.py | 9 ++++++--- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 98786befec8d..c0807ffa05dc 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -7246,11 +7246,20 @@ def explode( """ return self.lazy().explode(columns, *more_columns).collect(_eager=True) + @deprecate_nonkeyword_arguments( + allowed_args=["self"], + message=( + "The order of the parameters of `pivot` will change in the next breaking release." + " The order will become `index, columns, values` with `values` as an optional parameter." + " Use keyword arguments to silence this warning." + ), + version="0.20.8", + ) def pivot( self, + values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, - values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None, aggregate_function: PivotAgg | Expr | None = None, *, maintain_order: bool = True, @@ -7265,15 +7274,15 @@ def pivot( Parameters ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. If None, all remaining columns + will be used. index One or multiple keys to group by. columns Name of the column(s) whose values will be used as the header of the output DataFrame. - values - Column values to aggregate. Can be multiple columns if the *columns* - arguments contains multiple columns as well. If None, all remaining columns - will be used. aggregate_function Choose from: diff --git a/py-polars/tests/unit/operations/test_pivot.py b/py-polars/tests/unit/operations/test_pivot.py index f426d7204344..2cba1ee326f9 100644 --- a/py-polars/tests/unit/operations/test_pivot.py +++ b/py-polars/tests/unit/operations/test_pivot.py @@ -44,7 +44,7 @@ def test_pivot_no_values() -> None: "N2": [1, 2, 2, 4, 2], } ) - result = df.pivot(index="foo", columns="bar", aggregate_function=None) + result = df.pivot(index="foo", columns="bar", values=None, aggregate_function=None) expected = pl.DataFrame( { "foo": ["A", "B", "C"], @@ -60,7 +60,10 @@ def test_pivot_no_values() -> None: "N2_bar_o": [None, None, 2], } ) - assert_frame_equal(result, expected) + + # the order of the output columns is volatile + assert set(result.columns) == set(expected.columns) + assert_frame_equal(result, expected.select(result.columns)) def test_pivot_list() -> None: @@ -75,9 +78,9 @@ def test_pivot_list() -> None: } ) out = df.pivot( - values="b", index="a", columns="a", + values="b", aggregate_function="first", sort_columns=True, ) From ffaed2c5a598506c44de048fe0d425c0b0f29a0e Mon Sep 17 00:00:00 2001 From: Marshall Crumiller Date: Wed, 21 Feb 2024 19:30:51 -0500 Subject: [PATCH 6/8] Preserve column order --- crates/polars-ops/src/frame/pivot/mod.rs | 26 +++++++++---------- py-polars/tests/unit/operations/test_pivot.py | 4 +-- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index d04afa0dee1a..cadecf47b502 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -77,11 +77,10 @@ fn restore_logical_type(s: &Series, logical_type: &DataType) -> Series { } } -/// Determine `values` columns. +/// Determine `values` columns, which is optional in `pivot` calls. /// -/// When the optional `values` parameter is `None`, we use all remaining columns in the `DataFrame` -/// after `index` and `columns` have been excluded. When `values` is `Some`, we return a vector of -/// strings. +/// If not specified (i.e. is `None`, we use all remaining columns in the `DataFrame`)after `index` +/// and `columns` have been excluded. fn _get_values_columns( df: &DataFrame, index: &[String], @@ -98,20 +97,19 @@ where .map(|s| s.as_ref().to_string()) .collect::>(), None => { - let column_names = df.get_column_names_owned(); - let mut column_set = PlHashSet::::with_capacity(column_names.len()); + let mut column_set = PlHashSet::::with_capacity(index.len() + columns.len()); - // Column names are always unique. - column_names.into_iter().for_each(|s| { - column_set.insert_unique_unchecked(s.to_string()); - }); - - // Remove `index` and `column` columns. + // Hash columns we don't want to include index.iter().chain(columns.iter()).for_each(|s| { - column_set.remove(s); + column_set.insert_unique_unchecked(s.to_owned()); }); - column_set.drain().collect() + // filter out + df.get_column_names_owned() + .into_iter() + .map(|s| s.to_string()) + .filter(|s| !column_set.contains(s)) + .collect() }, } } diff --git a/py-polars/tests/unit/operations/test_pivot.py b/py-polars/tests/unit/operations/test_pivot.py index 2cba1ee326f9..e9f5617a6066 100644 --- a/py-polars/tests/unit/operations/test_pivot.py +++ b/py-polars/tests/unit/operations/test_pivot.py @@ -61,9 +61,7 @@ def test_pivot_no_values() -> None: } ) - # the order of the output columns is volatile - assert set(result.columns) == set(expected.columns) - assert_frame_equal(result, expected.select(result.columns)) + assert_frame_equal(result, expected) def test_pivot_list() -> None: From 85d9834a5a597b1b536683e704eafd9b4ceae14e Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 25 Feb 2024 20:40:15 +0100 Subject: [PATCH 7/8] Re-add depr test --- py-polars/tests/unit/operations/test_pivot.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/py-polars/tests/unit/operations/test_pivot.py b/py-polars/tests/unit/operations/test_pivot.py index e9f5617a6066..e5354703052b 100644 --- a/py-polars/tests/unit/operations/test_pivot.py +++ b/py-polars/tests/unit/operations/test_pivot.py @@ -421,6 +421,18 @@ def test_aggregate_function_default() -> None: df.pivot(index="b", columns="c", values="a") +def test_pivot_positional_args_deprecated() -> None: + df = pl.DataFrame( + { + "foo": ["A", "A", "B", "B", "C"], + "N": [1, 2, 2, 4, 2], + "bar": ["k", "l", "m", "n", "o"], + } + ) + with pytest.deprecated_call(): + df.pivot("N", "foo", "bar", aggregate_function=None) + + def test_pivot_aggregate_function_count_deprecated() -> None: df = pl.DataFrame( { From 53e5293e11f7f2b1459d646b9b80348b72cad168 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 25 Feb 2024 22:36:07 +0100 Subject: [PATCH 8/8] Revert to earlier implementation of values logic --- crates/polars-ops/src/frame/pivot/mod.rs | 66 ++++++++++-------------- 1 file changed, 27 insertions(+), 39 deletions(-) diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index cadecf47b502..cec9ddd01cdb 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -77,43 +77,6 @@ fn restore_logical_type(s: &Series, logical_type: &DataType) -> Series { } } -/// Determine `values` columns, which is optional in `pivot` calls. -/// -/// If not specified (i.e. is `None`, we use all remaining columns in the `DataFrame`)after `index` -/// and `columns` have been excluded. -fn _get_values_columns( - df: &DataFrame, - index: &[String], - columns: &[String], - values: Option, -) -> Vec -where - I: IntoIterator, - S: AsRef, -{ - match values { - Some(v) => v - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect::>(), - None => { - let mut column_set = PlHashSet::::with_capacity(index.len() + columns.len()); - - // Hash columns we don't want to include - index.iter().chain(columns.iter()).for_each(|s| { - column_set.insert_unique_unchecked(s.to_owned()); - }); - - // filter out - df.get_column_names_owned() - .into_iter() - .map(|s| s.to_string()) - .filter(|s| !column_set.contains(s)) - .collect() - }, - } -} - /// Do a pivot operation based on the group key, a pivot column and an aggregation function on the values column. /// /// # Note @@ -144,7 +107,7 @@ where .into_iter() .map(|s| s.as_ref().to_string()) .collect::>(); - let values = _get_values_columns(pivot_df, &index, &columns, values); + let values = get_values_columns(pivot_df, &index, &columns, values); pivot_impl( pivot_df, &index, @@ -187,7 +150,7 @@ where .into_iter() .map(|s| s.as_ref().to_string()) .collect::>(); - let values = _get_values_columns(pivot_df, &index, &columns, values); + let values = get_values_columns(pivot_df, &index, &columns, values); pivot_impl( pivot_df, &index, @@ -200,6 +163,31 @@ where ) } +/// Determine `values` columns, which is optional in `pivot` calls. +/// +/// If not specified (i.e. is `None`), use all remaining columns in the +/// `DataFrame` after `index` and `columns` have been excluded. +fn get_values_columns( + df: &DataFrame, + index: &[String], + columns: &[String], + values: Option, +) -> Vec +where + I: IntoIterator, + S: AsRef, +{ + match values { + Some(v) => v.into_iter().map(|s| s.as_ref().to_string()).collect(), + None => df + .get_column_names() + .into_iter() + .map(|c| c.to_string()) + .filter(|c| !(index.contains(c) | columns.contains(c))) + .collect(), + } +} + #[allow(clippy::too_many_arguments)] fn pivot_impl( pivot_df: &DataFrame,